Add ability to attach images to messages

Saoud Rizwan
2024-08-08 02:44:51 -04:00
parent 9acae31fbb
commit 911dd159cd
16 changed files with 1129 additions and 179 deletions

View File

@@ -56,6 +56,7 @@ RULES
- NEVER end attempt_completion with a question or request to engage in further conversation! Formulate the end of your result in a way that is final and does not require further input from the user.
- NEVER start your responses with affirmations like "Certainly", "Okay", "Sure", "Great", etc. You should NOT be conversational in your responses, but rather direct and to the point.
- Feel free to use markdown as much as you'd like in your responses. When using code blocks, always include a language specifier.
- When presented with images, utilize your vision capabilities to thoroughly examine them and extract meaningful information. Incorporate these insights into your thought process as you accomplish the user's task.
====
@@ -229,6 +230,8 @@ const tools: Tool[] = [
},
]
type ToolResponse = string | Array<Anthropic.TextBlockParam | Anthropic.ImageBlockParam>
export class ClaudeDev {
private api: ApiHandler
private maxRequestsPerTask: number
@@ -237,6 +240,7 @@ export class ClaudeDev {
claudeMessages: ClaudeMessage[] = []
private askResponse?: ClaudeAskResponse
private askResponseText?: string
private askResponseImages?: string[]
private lastMessageTs?: number
private providerRef: WeakRef<ClaudeDevProvider>
abort: boolean = false
@@ -245,13 +249,14 @@ export class ClaudeDev {
provider: ClaudeDevProvider,
task: string,
apiConfiguration: ApiConfiguration,
maxRequestsPerTask?: number
maxRequestsPerTask?: number,
images?: string[]
) {
this.providerRef = new WeakRef(provider)
this.api = buildApiHandler(apiConfiguration)
this.maxRequestsPerTask = maxRequestsPerTask ?? DEFAULT_MAX_REQUESTS_PER_TASK
this.startTask(task)
this.startTask(task, images)
}
updateApi(apiConfiguration: ApiConfiguration) {
@@ -262,18 +267,23 @@ export class ClaudeDev {
this.maxRequestsPerTask = maxRequestsPerTask ?? DEFAULT_MAX_REQUESTS_PER_TASK
}
async handleWebviewAskResponse(askResponse: ClaudeAskResponse, text?: string) {
async handleWebviewAskResponse(askResponse: ClaudeAskResponse, text?: string, images?: string[]) {
this.askResponse = askResponse
this.askResponseText = text
this.askResponseImages = images
}
async ask(type: ClaudeAsk, question: string): Promise<{ response: ClaudeAskResponse; text?: string }> {
async ask(
type: ClaudeAsk,
question: string
): Promise<{ response: ClaudeAskResponse; text?: string; images?: string[] }> {
// If this ClaudeDev instance was aborted by the provider, then the only thing keeping us alive is a promise still running in the background, in which case we don't want to send its result to the webview as it is attached to a new instance of ClaudeDev now. So we can safely ignore the result of any active promises, and this class will be deallocated. (Although we set claudeDev = undefined in provider, that simply removes the reference to this instance, but the instance is still alive until this promise resolves or rejects.)
if (this.abort) {
throw new Error("ClaudeDev instance aborted")
}
this.askResponse = undefined
this.askResponseText = undefined
this.askResponseImages = undefined
const askTs = Date.now()
this.lastMessageTs = askTs
this.claudeMessages.push({ ts: askTs, type: "ask", ask: type, text: question })
@@ -282,23 +292,44 @@ export class ClaudeDev {
if (this.lastMessageTs !== askTs) {
throw new Error("Current ask promise was ignored") // could happen if we send multiple asks in a row i.e. with command_output. It's important that when we know an ask could fail, it is handled gracefully
}
const result = { response: this.askResponse!, text: this.askResponseText }
const result = { response: this.askResponse!, text: this.askResponseText, images: this.askResponseImages }
this.askResponse = undefined
this.askResponseText = undefined
this.askResponseImages = undefined
return result
}
async say(type: ClaudeSay, text?: string): Promise<undefined> {
async say(type: ClaudeSay, text?: string, images?: string[]): Promise<undefined> {
if (this.abort) {
throw new Error("ClaudeDev instance aborted")
}
const sayTs = Date.now()
this.lastMessageTs = sayTs
this.claudeMessages.push({ ts: sayTs, type: "say", say: type, text: text })
this.claudeMessages.push({ ts: sayTs, type: "say", say: type, text: text, images })
await this.providerRef.deref()?.postStateToWebview()
}
private async startTask(task: string): Promise<void> {
private formatImagesIntoBlocks(images?: string[]): Anthropic.ImageBlockParam[] {
return images
? images.map((base64) => ({
type: "image",
source: { type: "base64", media_type: "image/webp", data: base64 },
}))
: []
}
private formatIntoToolResponse(text?: string, images?: string[]): ToolResponse {
if (images && images.length > 0) {
const textBlock: Anthropic.TextBlockParam = { type: "text", text: text ?? "" }
const imageBlocks: Anthropic.ImageBlockParam[] = this.formatImagesIntoBlocks(images)
// "Just as with document-query placement, Claude works best when images come before text. Images placed after text or interpolated with text will still perform well, but if your use case allows it, we recommend an image-then-text structure."
return [...imageBlocks, textBlock]
} else {
return text ?? ""
}
}
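
A minimal usage sketch, not part of this commit's diff (the feedback text and base64 payload are made up), showing what formatIntoToolResponse returns when a denied operation carries user feedback plus one attached image: image blocks first, then the text block, per the Anthropic guidance quoted in the code above.

import { Anthropic } from "@anthropic-ai/sdk"

type ToolResponse = string | Array<Anthropic.TextBlockParam | Anthropic.ImageBlockParam>

// hypothetical inputs
const feedback = 'The user denied this operation and provided the following feedback:\n"Use tabs, not spaces."'
const images = ["UklGRl4AAABXRUJQ..."] // base64 webp data, no data: URL prefix

// equivalent to this.formatIntoToolResponse(feedback, images)
const toolResponse: ToolResponse = [
    ...images.map((data): Anthropic.ImageBlockParam => ({
        type: "image",
        source: { type: "base64", media_type: "image/webp", data },
    })),
    { type: "text", text: feedback },
]
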
private async startTask(task: string, images?: string[]): Promise<void> {
// conversationHistory (for API) and claudeMessages (for webview) need to be in sync
// if the extension process were killed, then on restart the claudeMessages might not be empty, so we need to set it to [] when we create a new ClaudeDev client (otherwise webview would show stale messages from previous session)
this.claudeMessages = []
@@ -306,19 +337,22 @@ export class ClaudeDev {
await this.providerRef.deref()?.postStateToWebview()
// This first message kicks off a task, it is not included in every subsequent message.
let userPrompt = `Task: \"${task}\"`
let textBlock: Anthropic.TextBlockParam = { type: "text", text: `Task: \"${task}\"` }
let imageBlocks: Anthropic.ImageBlockParam[] = this.formatImagesIntoBlocks(images)
// TODO: create tools that let Claude interact with VSCode (e.g. open a file, list open files, etc.)
//const openFiles = vscode.window.visibleTextEditors?.map((editor) => editor.document.uri.fsPath).join("\n")
await this.say("text", task)
await this.say("text", task, images)
let totalInputTokens = 0
let totalOutputTokens = 0
while (this.requestCount < this.maxRequestsPerTask) {
const { didEndLoop, inputTokens, outputTokens } = await this.recursivelyMakeClaudeRequests([
{ type: "text", text: userPrompt },
...imageBlocks,
textBlock,
])
totalInputTokens += inputTokens
totalOutputTokens += outputTokens
@@ -328,6 +362,7 @@ export class ClaudeDev {
//const totalCost = this.calculateApiCost(totalInputTokens, totalOutputTokens)
if (didEndLoop) {
// for now this never happens
//this.say("task_completed", `Task completed. Total API usage cost: ${totalCost}`)
break
} else {
@@ -335,13 +370,16 @@ export class ClaudeDev {
// "tool",
// "Claude responded with only text blocks but has not called attempt_completion yet. Forcing him to continue with task..."
// )
userPrompt =
"Ask yourself if you have completed the user's task. If you have, use the attempt_completion tool, otherwise proceed to the next step. (This is an automated message, so do not respond to it conversationally. Just proceed with the task.)"
textBlock = {
type: "text",
text: "Ask yourself if you have completed the user's task. If you have, use the attempt_completion tool, otherwise proceed to the next step. (This is an automated message, so do not respond to it conversationally. Just proceed with the task.)",
}
imageBlocks = []
}
}
}
async executeTool(toolName: ToolName, toolInput: any, isLastWriteToFile: boolean = false): Promise<string> {
async executeTool(toolName: ToolName, toolInput: any, isLastWriteToFile: boolean = false): Promise<ToolResponse> {
switch (toolName) {
case "write_to_file":
return this.writeToFile(toolInput.path, toolInput.content, isLastWriteToFile)
@@ -374,7 +412,7 @@ export class ClaudeDev {
return totalCost
}
async writeToFile(relPath: string, newContent: string, isLast: boolean): Promise<string> {
async writeToFile(relPath: string, newContent: string, isLast: boolean): Promise<ToolResponse> {
try {
const absolutePath = path.resolve(cwd, relPath)
const fileExists = await fs
@@ -414,7 +452,7 @@ export class ClaudeDev {
`${fileName}: Original ↔ Suggested Changes`
)
const { response, text } = await this.ask(
const { response, text, images } = await this.ask(
"tool",
JSON.stringify({
tool: "editedExistingFile",
@@ -426,9 +464,12 @@ export class ClaudeDev {
if (isLast) {
await this.closeDiffViews()
}
if (response === "textResponse" && text) {
await this.say("user_feedback", text)
return `The user denied this operation and provided the following feedback:\n\"${text}\"`
if (response === "messageResponse") {
await this.say("user_feedback", text, images)
return this.formatIntoToolResponse(
`The user denied this operation and provided the following feedback:\n\"${text}\"`,
images
)
}
return "The user denied this operation."
}
@@ -451,7 +492,7 @@ export class ClaudeDev {
}),
`${fileName}: New File`
)
const { response, text } = await this.ask(
const { response, text, images } = await this.ask(
"tool",
JSON.stringify({
tool: "newFileCreated",
@@ -463,9 +504,12 @@ export class ClaudeDev {
if (isLast) {
await this.closeDiffViews()
}
if (response === "textResponse" && text) {
await this.say("user_feedback", text)
return `The user denied this operation and provided the following feedback:\n\"${text}\"`
if (response === "messageResponse") {
await this.say("user_feedback", text, images)
return this.formatIntoToolResponse(
`The user denied this operation and provided the following feedback:\n\"${text}\"`,
images
)
}
return "The user denied this operation."
}
@@ -497,18 +541,21 @@ export class ClaudeDev {
}
}
async readFile(relPath: string): Promise<string> {
async readFile(relPath: string): Promise<ToolResponse> {
try {
const absolutePath = path.resolve(cwd, relPath)
const content = await fs.readFile(absolutePath, "utf-8")
const { response, text } = await this.ask(
const { response, text, images } = await this.ask(
"tool",
JSON.stringify({ tool: "readFile", path: this.getReadablePath(relPath), content } as ClaudeSayTool)
)
if (response !== "yesButtonTapped") {
if (response === "textResponse" && text) {
await this.say("user_feedback", text)
return `The user denied this operation and provided the following feedback:\n\"${text}\"`
if (response === "messageResponse") {
await this.say("user_feedback", text, images)
return this.formatIntoToolResponse(
`The user denied this operation and provided the following feedback:\n\"${text}\"`,
images
)
}
return "The user denied this operation."
}
@@ -520,12 +567,12 @@ export class ClaudeDev {
}
}
async listFilesTopLevel(relDirPath: string): Promise<string> {
async listFilesTopLevel(relDirPath: string): Promise<ToolResponse> {
try {
const absolutePath = path.resolve(cwd, relDirPath)
const files = await listFiles(absolutePath, false)
const result = this.formatFilesList(absolutePath, files)
const { response, text } = await this.ask(
const { response, text, images } = await this.ask(
"tool",
JSON.stringify({
tool: "listFilesTopLevel",
@@ -534,9 +581,12 @@ export class ClaudeDev {
} as ClaudeSayTool)
)
if (response !== "yesButtonTapped") {
if (response === "textResponse" && text) {
await this.say("user_feedback", text)
return `The user denied this operation and provided the following feedback:\n\"${text}\"`
if (response === "messageResponse") {
await this.say("user_feedback", text, images)
return this.formatIntoToolResponse(
`The user denied this operation and provided the following feedback:\n\"${text}\"`,
images
)
}
return "The user denied this operation."
}
@@ -553,12 +603,12 @@ export class ClaudeDev {
}
}
async listFilesRecursive(relDirPath: string): Promise<string> {
async listFilesRecursive(relDirPath: string): Promise<ToolResponse> {
try {
const absolutePath = path.resolve(cwd, relDirPath)
const files = await listFiles(absolutePath, true)
const result = this.formatFilesList(absolutePath, files)
const { response, text } = await this.ask(
const { response, text, images } = await this.ask(
"tool",
JSON.stringify({
tool: "listFilesRecursive",
@@ -567,9 +617,12 @@ export class ClaudeDev {
} as ClaudeSayTool)
)
if (response !== "yesButtonTapped") {
if (response === "textResponse" && text) {
await this.say("user_feedback", text)
return `The user denied this operation and provided the following feedback:\n\"${text}\"`
if (response === "messageResponse") {
await this.say("user_feedback", text, images)
return this.formatIntoToolResponse(
`The user denied this operation and provided the following feedback:\n\"${text}\"`,
images
)
}
return "The user denied this operation."
}
@@ -633,11 +686,11 @@ export class ClaudeDev {
}
}
async viewSourceCodeDefinitionsTopLevel(relDirPath: string): Promise<string> {
async viewSourceCodeDefinitionsTopLevel(relDirPath: string): Promise<ToolResponse> {
try {
const absolutePath = path.resolve(cwd, relDirPath)
const result = await parseSourceCodeForDefinitionsTopLevel(absolutePath)
const { response, text } = await this.ask(
const { response, text, images } = await this.ask(
"tool",
JSON.stringify({
tool: "viewSourceCodeDefinitionsTopLevel",
@@ -646,9 +699,12 @@ export class ClaudeDev {
} as ClaudeSayTool)
)
if (response !== "yesButtonTapped") {
if (response === "textResponse" && text) {
await this.say("user_feedback", text)
return `The user denied this operation and provided the following feedback:\n\"${text}\"`
if (response === "messageResponse") {
await this.say("user_feedback", text, images)
return this.formatIntoToolResponse(
`The user denied this operation and provided the following feedback:\n\"${text}\"`,
images
)
}
return "The user denied this operation."
}
@@ -665,12 +721,15 @@ export class ClaudeDev {
}
}
async executeCommand(command: string, returnEmptyStringOnSuccess: boolean = false): Promise<string> {
const { response, text } = await this.ask("command", command)
async executeCommand(command: string, returnEmptyStringOnSuccess: boolean = false): Promise<ToolResponse> {
const { response, text, images } = await this.ask("command", command)
if (response !== "yesButtonTapped") {
if (response === "textResponse" && text) {
await this.say("user_feedback", text)
return `The user denied this operation and provided the following feedback:\n\"${text}\"`
if (response === "messageResponse") {
await this.say("user_feedback", text, images)
return this.formatIntoToolResponse(
`The user denied this operation and provided the following feedback:\n\"${text}\"`,
images
)
}
return "The user denied this operation."
}
@@ -756,13 +815,13 @@ export class ClaudeDev {
}
}
async askFollowupQuestion(question: string): Promise<string> {
const { text } = await this.ask("followup", question)
await this.say("user_feedback", text ?? "")
return `User's response:\n\"${text}\"`
async askFollowupQuestion(question: string): Promise<ToolResponse> {
const { text, images } = await this.ask("followup", question)
await this.say("user_feedback", text ?? "", images)
return this.formatIntoToolResponse(`User's response:\n\"${text}\"`, images)
}
async attemptCompletion(result: string, command?: string): Promise<string> {
async attemptCompletion(result: string, command?: string): Promise<ToolResponse> {
let resultToSend = result
if (command) {
await this.say("completion_result", resultToSend)
@@ -774,12 +833,15 @@ export class ClaudeDev {
}
resultToSend = ""
}
const { response, text } = await this.ask("completion_result", resultToSend) // this prompts webview to show 'new task' button, and enable text input (which would be the 'text' here)
const { response, text, images } = await this.ask("completion_result", resultToSend) // this prompts webview to show 'new task' button, and enable text input (which would be the 'text' here)
if (response === "yesButtonTapped") {
return ""
return "" // signals to recursive loop to stop (for now this never happens since yesButtonTapped will trigger a new task)
}
await this.say("user_feedback", text ?? "")
return `The user is not pleased with the results. Use the feedback they provided to successfully complete the task, and then attempt completion again.\nUser's feedback:\n\"${text}\"`
await this.say("user_feedback", text ?? "", images)
return this.formatIntoToolResponse(
`The user is not pleased with the results. Use the feedback they provided to successfully complete the task, and then attempt completion again.\nUser's feedback:\n\"${text}\"`,
images
)
}
async attemptApiRequest(): Promise<Anthropic.Messages.Message> {

View File

@@ -1,5 +1,5 @@
import { Anthropic } from "@anthropic-ai/sdk"
import { ApiHandler } from "."
import { ApiHandler, withoutImageData } from "."
import { ApiHandlerOptions } from "../shared/api"
export class AnthropicHandler implements ApiHandler {
@@ -44,7 +44,7 @@ export class AnthropicHandler implements ApiHandler {
model: "claude-3-5-sonnet-20240620",
max_tokens: 8192,
system: "(see SYSTEM_PROMPT in src/ClaudeDev.ts)",
messages: [{ conversation_history: "..." }, { role: "user", content: userContent }],
messages: [{ conversation_history: "..." }, { role: "user", content: withoutImageData(userContent) }],
tools: "(see tools in src/ClaudeDev.ts)",
tool_choice: { type: "auto" },
}

View File

@@ -1,7 +1,7 @@
import AnthropicBedrock from "@anthropic-ai/bedrock-sdk"
import { Anthropic } from "@anthropic-ai/sdk"
import { ApiHandlerOptions } from "../shared/api"
import { ApiHandler } from "."
import { ApiHandler, withoutImageData } from "."
// https://docs.anthropic.com/en/api/claude-on-amazon-bedrock
export class AwsBedrockHandler implements ApiHandler {
@@ -49,7 +49,7 @@ export class AwsBedrockHandler implements ApiHandler {
model: "anthropic.claude-3-5-sonnet-20240620-v1:0",
max_tokens: 4096,
system: "(see SYSTEM_PROMPT in src/ClaudeDev.ts)",
messages: [{ conversation_history: "..." }, { role: "user", content: userContent }],
messages: [{ conversation_history: "..." }, { role: "user", content: withoutImageData(userContent) }],
tools: "(see tools in src/ClaudeDev.ts)",
tool_choice: { type: "auto" },
}

View File

@@ -34,3 +34,31 @@ export function buildApiHandler(configuration: ApiConfiguration): ApiHandler {
return new AnthropicHandler(options)
}
}
export function withoutImageData(
userContent: Array<
| Anthropic.TextBlockParam
| Anthropic.ImageBlockParam
| Anthropic.ToolUseBlockParam
| Anthropic.ToolResultBlockParam
>
): Array<
Anthropic.TextBlockParam | Anthropic.ImageBlockParam | Anthropic.ToolUseBlockParam | Anthropic.ToolResultBlockParam
> {
return userContent.map((part) => {
if (part.type === "image") {
return { ...part, source: { ...part.source, data: "..." } }
} else if (part.type === "tool_result" && typeof part.content !== "string") {
return {
...part,
content: part.content?.map((contentPart) => {
if (contentPart.type === "image") {
return { ...contentPart, source: { ...contentPart.source, data: "..." } }
}
return contentPart
}),
}
}
return part
})
}
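
A small usage sketch with assumed input values (not from the commit): withoutImageData backs the user-readable request previews shown in the handlers above, swapping base64 payloads for "..." so logged requests stay compact while text blocks pass through unchanged. The Anthropic block types come from @anthropic-ai/sdk.

import { Anthropic } from "@anthropic-ai/sdk"

const userContent: Array<Anthropic.TextBlockParam | Anthropic.ImageBlockParam> = [
    { type: "image", source: { type: "base64", media_type: "image/webp", data: "UklGRl4AAABXRUJQ..." } }, // hypothetical payload
    { type: "text", text: 'Task: "fix the login form"' },
]

const preview = withoutImageData(userContent)
// preview[0].source.data is now "..."; preview[1] is returned as-is
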

View File

@@ -1,6 +1,6 @@
import { Anthropic } from "@anthropic-ai/sdk"
import OpenAI from "openai"
import { ApiHandler } from "."
import { ApiHandler, withoutImageData } from "."
import { ApiHandlerOptions } from "../shared/api"
export class OpenRouterHandler implements ApiHandler {
@@ -118,6 +118,7 @@ export class OpenRouterHandler implements ApiHandler {
openAiMessages.push({ role: anthropicMessage.role, content: anthropicMessage.content })
} else {
// image_url.url is base64 encoded image data
// ensure it contains the content-type of the image: data:image/png;base64,
/*
{ role: "user", content: "" | { type: "text", text: string } | { type: "image_url", image_url: { url: string } } },
// content required unless tool_calls is present
@@ -146,7 +147,10 @@ export class OpenRouterHandler implements ApiHandler {
role: "user",
content: nonToolMessages.map((part) => {
if (part.type === "image") {
return { type: "image_url", image_url: { url: part.source.data } }
return {
type: "image_url",
image_url: { url: "data:image/webp;base64," + part.source.data },
}
}
return { type: "text", text: part.text }
}),
@@ -157,6 +161,7 @@ export class OpenRouterHandler implements ApiHandler {
toolMessages.forEach((toolMessage) => {
// The Anthropic SDK allows tool results to be a string or an array of text and image blocks, enabling rich and structured content. In contrast, the OpenAI SDK only supports tool results as a single string, so we map the Anthropic tool result parts into one concatenated string to maintain compatibility.
let content: string
let images: string[] = []
if (typeof toolMessage.content === "string") {
content = toolMessage.content
} else {
@@ -164,7 +169,8 @@ export class OpenRouterHandler implements ApiHandler {
toolMessage.content
?.map((part) => {
if (part.type === "image") {
return `{ type: "image_url", image_url: { url: ${part.source.data} } }`
images.push(part.source.data)
return "(see following user message for image)"
}
return part.text
})
@@ -175,6 +181,16 @@ export class OpenRouterHandler implements ApiHandler {
tool_call_id: toolMessage.tool_use_id,
content: content,
})
// If tool results contain images, send as a separate user message
if (images.length > 0) {
openAiMessages.push({
role: "user",
content: images.map((image) => ({
type: "image_url",
image_url: { url: "data:image/webp;base64," + image },
})),
})
}
})
} else if (anthropicMessage.role === "assistant") {
const { nonToolMessages, toolMessages } = anthropicMessage.content.reduce<{
@@ -198,7 +214,7 @@ export class OpenRouterHandler implements ApiHandler {
content = nonToolMessages
.map((part) => {
if (part.type === "image") {
return `{ type: "image_url", image_url: { url: ${part.source.data} } }`
return "" // impossible as the assistant cannot send images
}
return part.text
})
@@ -239,7 +255,7 @@ export class OpenRouterHandler implements ApiHandler {
return {
model: "anthropic/claude-3.5-sonnet:beta",
max_tokens: 4096,
messages: [{ conversation_history: "..." }, { role: "user", content: userContent }],
messages: [{ conversation_history: "..." }, { role: "user", content: withoutImageData(userContent) }],
tools: "(see tools in src/ClaudeDev.ts)",
tool_choice: "auto",
}
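
For reference, a minimal sketch of the mapping implemented above (payload made up): an Anthropic base64 image block becomes an OpenAI-style image_url part whose URL is a data URL carrying the webp content type, which is the form forwarded to OpenRouter.

import { Anthropic } from "@anthropic-ai/sdk"

const anthropicImage: Anthropic.ImageBlockParam = {
    type: "image",
    source: { type: "base64", media_type: "image/webp", data: "UklGRl4AAABXRUJQ..." }, // hypothetical payload
}

// OpenAI-style content part; the data: URL prefix supplies the content type the API needs
const openAiImagePart = {
    type: "image_url" as const,
    image_url: { url: "data:image/webp;base64," + anthropicImage.source.data },
}
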

View File

@@ -1,12 +1,14 @@
import { Uri, Webview } from "vscode"
import { Anthropic } from "@anthropic-ai/sdk"
import os from "os"
import * as path from "path"
import * as vscode from "vscode"
import { Uri, Webview } from "vscode"
import { ClaudeDev } from "../ClaudeDev"
import { ApiProvider } from "../shared/api"
import { ExtensionMessage } from "../shared/ExtensionMessage"
import { WebviewMessage } from "../shared/WebviewMessage"
import { processPastedImages, selectAndProcessImages } from "../utils/process-images"
import { downloadTask } from "../utils/export-markdown"
/*
https://github.com/microsoft/vscode-webview-ui-toolkit-samples/blob/main/default/weather-webview/src/providers/WeatherViewProvider.ts
@@ -134,7 +136,7 @@ export class ClaudeDevProvider implements vscode.WebviewViewProvider {
this.outputChannel.appendLine("Webview view resolved")
}
async initClaudeDevWithTask(task: string) {
async initClaudeDevWithTask(task: string, images?: string[]) {
await this.clearTask() // ensures any existing task is cleared before starting a new one, although this shouldn't be possible since the user must clear the task before starting a new one
const { apiProvider, apiKey, openRouterApiKey, awsAccessKey, awsSecretKey, awsRegion, maxRequestsPerTask } =
await this.getState()
@@ -142,7 +144,8 @@ export class ClaudeDevProvider implements vscode.WebviewViewProvider {
this,
task,
{ apiProvider, apiKey, openRouterApiKey, awsAccessKey, awsSecretKey, awsRegion },
maxRequestsPerTask
maxRequestsPerTask,
images
)
}
@@ -203,7 +206,8 @@ export class ClaudeDevProvider implements vscode.WebviewViewProvider {
create a content security policy meta tag so that only loading scripts with a nonce is allowed
As your extension grows you will likely want to add custom styles, fonts, and/or images to your webview. If you do, you will need to update the content security policy meta tag to explicitly allow for these resources. E.g.
<meta http-equiv="Content-Security-Policy" content="default-src 'none'; style-src ${webview.cspSource}; font-src ${webview.cspSource}; img-src ${webview.cspSource} https:; script-src 'nonce-${nonce}';">
- 'unsafe-inline' is required for styles due to vscode-webview-toolkit's dynamic style injection
- since we pass base64 images to the webview, we need to specify img-src ${webview.cspSource} data:;
in meta tag we add nonce attribute: A cryptographic nonce (only used once) to allow scripts. The server must generate a unique nonce value each time it transmits a policy. It is critical to provide a nonce that cannot be guessed as bypassing a resource's policy is otherwise trivial.
*/
@@ -217,7 +221,7 @@ export class ClaudeDevProvider implements vscode.WebviewViewProvider {
<meta charset="utf-8">
<meta name="viewport" content="width=device-width,initial-scale=1,shrink-to-fit=no">
<meta name="theme-color" content="#000000">
<meta http-equiv="Content-Security-Policy" content="default-src 'none'; font-src ${webview.cspSource}; style-src ${webview.cspSource}; script-src 'nonce-${nonce}';">
<meta http-equiv="Content-Security-Policy" content="default-src 'none'; font-src ${webview.cspSource}; style-src ${webview.cspSource} 'unsafe-inline'; img-src ${webview.cspSource} data:; script-src 'nonce-${nonce}';">
<link rel="stylesheet" type="text/css" href="${stylesUri}">
<link href="${codiconsUri}" rel="stylesheet" />
<title>Claude Dev</title>
@@ -253,7 +257,7 @@ export class ClaudeDevProvider implements vscode.WebviewViewProvider {
// Could also do this in extension .ts
//this.postMessageToWebview({ type: "text", text: `Extension: ${Date.now()}` })
// initializing new instance of ClaudeDev will make sure that any agentically running promises in old instance don't affect our new task. this essentially creates a fresh slate for the new task
await this.initClaudeDevWithTask(message.text!)
await this.initClaudeDevWithTask(message.text!, message.images)
break
case "apiConfiguration":
if (message.apiConfiguration) {
@@ -282,7 +286,7 @@ export class ClaudeDevProvider implements vscode.WebviewViewProvider {
await this.postStateToWebview()
break
case "askResponse":
this.claudeDev?.handleWebviewAskResponse(message.askResponse!, message.text)
this.claudeDev?.handleWebviewAskResponse(message.askResponse!, message.text, message.images)
break
case "clearTask":
// newTask will start a new task with a given task text, while clear task resets the current session and allows for a new task to be started
@@ -294,7 +298,19 @@ export class ClaudeDevProvider implements vscode.WebviewViewProvider {
await this.postStateToWebview()
break
case "downloadTask":
this.downloadTask()
downloadTask(this.claudeDev?.apiConversationHistory ?? [])
break
case "selectImages":
const images = await selectAndProcessImages()
await this.postMessageToWebview({ type: "selectedImages", images })
break
case "processPastedImages":
const pastedImages = message.images ?? []
if (pastedImages.length > 0) {
const processedImages = await processPastedImages(pastedImages)
await this.postMessageToWebview({ type: "selectedImages", images: processedImages })
}
break
// Add more switch case statements here as more webview message commands
// are created within the webview context (i.e. inside media/main.js)
@@ -305,82 +321,6 @@ export class ClaudeDevProvider implements vscode.WebviewViewProvider {
)
}
async downloadTask() {
// File name
const date = new Date()
const month = date.toLocaleString("en-US", { month: "short" }).toLowerCase()
const day = date.getDate()
const year = date.getFullYear()
let hours = date.getHours()
const minutes = date.getMinutes().toString().padStart(2, "0")
const ampm = hours >= 12 ? "pm" : "am"
hours = hours % 12
hours = hours ? hours : 12 // the hour '0' should be '12'
const fileName = `claude_dev_task_${month}-${day}-${year}_${hours}-${minutes}-${ampm}.md`
// Generate markdown
const conversationHistory = this.claudeDev?.apiConversationHistory || []
const markdownContent = conversationHistory
.map((message) => {
const role = message.role === "user" ? "**User:**" : "**Assistant:**"
const content = Array.isArray(message.content)
? message.content.map(this.formatContentBlockToMarkdown).join("\n")
: message.content
return `${role}\n\n${content}\n\n`
})
.join("---\n\n")
// Prompt user for save location
const saveUri = await vscode.window.showSaveDialog({
filters: { Markdown: ["md"] },
defaultUri: vscode.Uri.file(path.join(os.homedir(), "Downloads", fileName)),
})
if (saveUri) {
// Write content to the selected location
await vscode.workspace.fs.writeFile(saveUri, Buffer.from(markdownContent))
vscode.window.showTextDocument(saveUri, { preview: true })
}
}
private formatContentBlockToMarkdown(
block:
| Anthropic.TextBlockParam
| Anthropic.ImageBlockParam
| Anthropic.ToolUseBlockParam
| Anthropic.ToolResultBlockParam
): string {
switch (block.type) {
case "text":
return block.text
case "image":
return `[Image: ${block.source.media_type}]`
case "tool_use":
let input: string
if (typeof block.input === "object" && block.input !== null) {
input = Object.entries(block.input)
.map(([key, value]) => `${key.charAt(0).toUpperCase() + key.slice(1)}: ${value}`)
.join("\n")
} else {
input = String(block.input)
}
return `[Tool Use: ${block.name}]\n${input}`
case "tool_result":
if (typeof block.content === "string") {
return `[Tool Result${block.is_error ? " (Error)" : ""}]\n${block.content}`
} else if (Array.isArray(block.content)) {
return `[Tool Result${block.is_error ? " (Error)" : ""}]\n${block.content
.map(this.formatContentBlockToMarkdown)
.join("\n")}`
} else {
return `[Tool Result${block.is_error ? " (Error)" : ""}]`
}
default:
return "[Unexpected content type]"
}
}
async postStateToWebview() {
const {
apiProvider,

View File

@@ -4,10 +4,11 @@ import { ApiConfiguration } from "./api"
// webview will hold state
export interface ExtensionMessage {
type: "action" | "state"
type: "action" | "state" | "selectedImages"
text?: string
action?: "plusButtonTapped" | "settingsButtonTapped" | "didBecomeVisible"
state?: ExtensionState
images?: string[]
}
export interface ExtensionState {
@@ -24,6 +25,7 @@ export interface ClaudeMessage {
ask?: ClaudeAsk
say?: ClaudeSay
text?: string
images?: string[]
}
export type ClaudeAsk =

View File

@@ -10,9 +10,12 @@ export interface WebviewMessage {
| "clearTask"
| "didShowAnnouncement"
| "downloadTask"
| "selectImages"
| "processPastedImages"
text?: string
askResponse?: ClaudeAskResponse
apiConfiguration?: ApiConfiguration
images?: string[]
}
export type ClaudeAskResponse = "yesButtonTapped" | "noButtonTapped" | "textResponse"
export type ClaudeAskResponse = "yesButtonTapped" | "noButtonTapped" | "messageResponse"
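
A hypothetical webview-side sketch (the webview React changes are part of this commit but not shown in this excerpt) of how these types are meant to round-trip: the webview posts a selectImages WebviewMessage, and the extension answers with a selectedImages ExtensionMessage carrying the processed base64 strings.

// assumed wiring, not taken from the diff; acquireVsCodeApi is the global VS Code injects into webviews
declare function acquireVsCodeApi(): { postMessage(message: unknown): void }
const vscode = acquireVsCodeApi()

// ask the extension to open the native image picker
vscode.postMessage({ type: "selectImages" } as WebviewMessage)

// receive the processed images back
window.addEventListener("message", (event: MessageEvent) => {
    const message = event.data as ExtensionMessage
    if (message.type === "selectedImages" && message.images) {
        // keep message.images around and attach them to the next "newTask" or "askResponse" message
    }
})
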

View File

@@ -0,0 +1,79 @@
import { Anthropic } from "@anthropic-ai/sdk"
import os from "os"
import * as path from "path"
import * as vscode from "vscode"
export async function downloadTask(conversationHistory: Anthropic.MessageParam[]) {
// File name
const date = new Date()
const month = date.toLocaleString("en-US", { month: "short" }).toLowerCase()
const day = date.getDate()
const year = date.getFullYear()
let hours = date.getHours()
const minutes = date.getMinutes().toString().padStart(2, "0")
const ampm = hours >= 12 ? "pm" : "am"
hours = hours % 12
hours = hours ? hours : 12 // the hour '0' should be '12'
const fileName = `claude_dev_task_${month}-${day}-${year}_${hours}-${minutes}-${ampm}.md`
// Generate markdown
const markdownContent = conversationHistory
.map((message) => {
const role = message.role === "user" ? "**User:**" : "**Assistant:**"
const content = Array.isArray(message.content)
? message.content.map(formatContentBlockToMarkdown).join("\n")
: message.content
return `${role}\n\n${content}\n\n`
})
.join("---\n\n")
// Prompt user for save location
const saveUri = await vscode.window.showSaveDialog({
filters: { Markdown: ["md"] },
defaultUri: vscode.Uri.file(path.join(os.homedir(), "Downloads", fileName)),
})
if (saveUri) {
// Write content to the selected location
await vscode.workspace.fs.writeFile(saveUri, Buffer.from(markdownContent))
vscode.window.showTextDocument(saveUri, { preview: true })
}
}
function formatContentBlockToMarkdown(
block:
| Anthropic.TextBlockParam
| Anthropic.ImageBlockParam
| Anthropic.ToolUseBlockParam
| Anthropic.ToolResultBlockParam
): string {
switch (block.type) {
case "text":
return block.text
case "image":
return `[Image]`
case "tool_use":
let input: string
if (typeof block.input === "object" && block.input !== null) {
input = Object.entries(block.input)
.map(([key, value]) => `${key.charAt(0).toUpperCase() + key.slice(1)}: ${value}`)
.join("\n")
} else {
input = String(block.input)
}
return `[Tool Use: ${block.name}]\n${input}`
case "tool_result":
if (typeof block.content === "string") {
return `[Tool Result${block.is_error ? " (Error)" : ""}]\n${block.content}`
} else if (Array.isArray(block.content)) {
return `[Tool Result${block.is_error ? " (Error)" : ""}]\n${block.content
.map(formatContentBlockToMarkdown)
.join("\n")}`
} else {
return `[Tool Result${block.is_error ? " (Error)" : ""}]`
}
default:
return "[Unexpected content type]"
}
}

View File

@@ -0,0 +1,64 @@
import * as vscode from "vscode"
import fs from "fs/promises"
import sharp from "sharp"
export async function selectAndProcessImages(): Promise<string[]> {
const options: vscode.OpenDialogOptions = {
canSelectMany: true,
openLabel: "Select",
filters: {
Images: ["png", "jpg", "jpeg", "gif", "webp", "tiff", "avif", "svg"], // sharp can convert these to webp which both anthropic and openrouter support
},
}
const fileUris = await vscode.window.showOpenDialog(options)
if (!fileUris || fileUris.length === 0) {
return []
}
return await Promise.all(
fileUris.map(async (uri) => {
const imagePath = uri.fsPath
const originalBuffer = await fs.readFile(imagePath)
return convertToWebpBase64(originalBuffer)
})
)
}
export async function processPastedImages(base64Strings: string[]): Promise<string[]> {
return await Promise.all(
base64Strings.map(async (base64) => {
const buffer = Buffer.from(base64, "base64")
return convertToWebpBase64(buffer)
})
)
}
async function convertToWebpBase64(buffer: Buffer): Promise<string> {
const processedBuffer = await sharp(buffer)
/*
Anthropic docs recommendations:
- To improve time-to-first-token resize images to no more than 1.15 megapixels (and within 1568 pixels in both dimensions)
- WebP is a newer image format that's more efficient than PNG and JPEG, so ideal for keeping token usage low. (I've seen the following compression decrease size by 10x)
*/
.resize(1568, 1568, {
fit: "inside", // maintain aspect ratio
withoutEnlargement: true, // don't enlarge smaller images
})
.webp({
// NOTE: consider increasing effort from 4 to 6 (max), this may increase processing time by up to ~500ms
quality: 80,
})
.toBuffer()
const base64 = processedBuffer.toString("base64")
// console.log({
// originalSize: buffer.length,
// processedSize: processedBuffer.length,
// base64,
// })
return base64
}
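
A short glue sketch (assumed, not from the diff; the import path is illustrative) tying this utility back to ClaudeDev.formatImagesIntoBlocks: every selected or pasted image is re-encoded to webp here, which is why the Anthropic image blocks can hard-code media_type "image/webp".

import { Anthropic } from "@anthropic-ai/sdk"
import { selectAndProcessImages } from "./process-images" // path assumed

async function buildImageBlocks(): Promise<Anthropic.ImageBlockParam[]> {
    const images = await selectAndProcessImages() // base64 webp strings from the picker above
    return images.map((data): Anthropic.ImageBlockParam => ({
        type: "image",
        source: { type: "base64", media_type: "image/webp", data },
    }))
}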