Fix gemini message conversion

Saoud Rizwan
2024-09-12 11:49:32 -04:00
parent fbb7620fa1
commit 3b004aed37
5 changed files with 121 additions and 49 deletions

View File

@@ -70,6 +70,27 @@ export class OpenRouterHandler implements ApiHandler {
const anthropicMessage = convertToAnthropicMessage(completion)
// Check if the model is Gemini Flash and remove extra escapes in tool result args
// switch (this.getModel().id) {
// case "google/gemini-pro-1.5":
// case "google/gemini-flash-1.5":
// const content = anthropicMessage.content
// for (const block of content) {
// if (
// block.type === "tool_use" &&
// typeof block.input === "object" &&
// block.input !== null &&
// "content" in block.input &&
// typeof block.input.content === "string"
// ) {
// block.input.content = unescapeGeminiContent(block.input.content)
// }
// }
// break
// default:
// break
// }
return { message: anthropicMessage }
}

View File

@@ -201,17 +201,22 @@ export const openRouterModels = {
// outputPrice: 0.06,
// },
// OpenRouter needs to fix its mapping of Gemini 1.5 responses for tool calls: it returns content with incorrectly formatted line breaks (too many escapes) and throws out-of-order errors for messages that are actually in order. It also cannot handle feedback given to a request with multiple tool uses; giving feedback to a single tool use request works fine. ("Please ensure that function response turn comes immediately after a function call turn. And the number of function response parts should be equal to number of function call parts of the function call turn.")
// UPDATE: I keep getting "400: Please ensure that function call turn comes immediately after a user turn or after a function response turn.", which goes away as soon as I switch to openrouter/claude, so it's clearly an error on OpenRouter's end when transforming the message structure. This is likely the culprit behind the tool order error people have seen with GPT-4o.
// "google/gemini-pro-1.5": {
// maxTokens: 8192,
// supportsImages: false, // "Function Calling is not supported with non-text input"
// contextWindow: 2_097_152,
// supportsImages: true, // "Function Calling is not supported with non-text input"
// supportsPromptCache: false,
// inputPrice: 2.5,
// outputPrice: 7.5,
// },
// "google/gemini-flash-1.5": {
// maxTokens: 8192,
// supportsImages: false, // "Function Calling is not supported with non-text input"
// inputPrice: 0.25,
// outputPrice: 0.75,
// contextWindow: 1_048_576,
// supportsImages: true, // "Function Calling is not supported with non-text input"
// supportsPromptCache: false,
// inputPrice: 0.0375,
// outputPrice: 0.15,
// },
// "google/gemini-pro": {
// maxTokens: 8192,
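
For context on the quoted turn-ordering error: a minimal sketch (illustrative values, using the Content type from @google/generative-ai) of the ordering Gemini expects. A functionResponse turn must come immediately after the functionCall turn it answers, with one response part per call part; following the role mapping in convertAnthropicMessageToGemini below, tool results are sent back in a "user" turn.

// Illustrative only, not part of this commit
const turns: Content[] = [
	{ role: "user", parts: [{ text: "Read both files" }] },
	{
		role: "model",
		parts: [
			{ functionCall: { name: "read_file", args: { path: "a.ts" } } },
			{ functionCall: { name: "read_file", args: { path: "b.ts" } } },
		],
	},
	{
		// must immediately follow the functionCall turn, with one functionResponse part per functionCall part
		role: "user",
		parts: [
			{ functionResponse: { name: "read_file", response: { name: "read_file", content: "contents of a.ts" } } },
			{ functionResponse: { name: "read_file", response: { name: "read_file", content: "contents of b.ts" } } },
		],
	},
]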

View File

@@ -1,5 +1,15 @@
import { Anthropic } from "@anthropic-ai/sdk"
import { Content, EnhancedGenerateContentResponse, FunctionDeclaration, Part, SchemaType } from "@google/generative-ai"
import {
Content,
EnhancedGenerateContentResponse,
FunctionCallPart,
FunctionDeclaration,
FunctionResponsePart,
InlineDataPart,
Part,
SchemaType,
TextPart,
} from "@google/generative-ai"
export function convertAnthropicContentToGemini(
content:
@@ -12,12 +22,12 @@ export function convertAnthropicContentToGemini(
>
): Part[] {
if (typeof content === "string") {
return [{ text: content }]
return [{ text: content } as TextPart]
}
return content.map((block) => {
return content.flatMap((block) => {
switch (block.type) {
case "text":
return { text: block.text }
return { text: block.text } as TextPart
case "image":
if (block.source.type !== "base64") {
throw new Error("Unsupported image source type")
@@ -27,22 +37,55 @@ export function convertAnthropicContentToGemini(
data: block.source.data,
mimeType: block.source.media_type,
},
}
} as InlineDataPart
case "tool_use":
return {
functionCall: {
name: block.name,
args: block.input,
},
} as Part
} as FunctionCallPart
case "tool_result":
// tool_use ids are generated as `${name}-${index}-${timestamp}` in convertGeminiResponseToAnthropic below, so the first segment is the original tool name
const name = block.tool_use_id.split("-")[0]
if (!block.content) {
return []
}
if (typeof block.content === "string") {
return {
functionResponse: {
name: block.tool_use_id,
name,
response: {
name,
content: block.content,
},
},
} as FunctionResponsePart
} else {
// The only case where tool_result content is an array is when the tool failed and we're providing user feedback, potentially with images
const textParts = block.content.filter((part) => part.type === "text")
const imageParts = block.content.filter((part) => part.type === "image")
const text = textParts.length > 0 ? textParts.map((part) => part.text).join("\n\n") : ""
const imageText = imageParts.length > 0 ? "\n\n(See next part for image)" : ""
return [
{
functionResponse: {
name,
response: {
name,
content: text + imageText,
},
},
} as FunctionResponsePart,
...imageParts.map(
(part) =>
({
inlineData: {
data: part.source.data,
mimeType: part.source.media_type,
},
} as InlineDataPart)
),
]
}
default:
throw new Error(`Unsupported content block type: ${(block as any).type}`)
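
A hedged sketch of what the array branch above produces (the id and strings are illustrative; the id follows the `${name}-${index}-${timestamp}` format generated in convertGeminiResponseToAnthropic further down): a text-plus-image tool_result becomes one FunctionResponsePart followed by an InlineDataPart.

// Illustrative only, not part of this commit
const block: Anthropic.Messages.ToolResultBlockParam = {
	type: "tool_result",
	tool_use_id: "write_to_file-0-1726156172000",
	content: [
		{ type: "text", text: "The user denied this operation and provided feedback." },
		{ type: "image", source: { type: "base64", media_type: "image/png", data: "iVBORw0KGgo..." } },
	],
}
// convertAnthropicContentToGemini([block]) would yield roughly:
// [
//   { functionResponse: { name: "write_to_file", response: { name: "write_to_file",
//       content: "The user denied this operation and provided feedback.\n\n(See next part for image)" } } },
//   { inlineData: { data: "iVBORw0KGgo...", mimeType: "image/png" } },
// ]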
@@ -52,7 +95,7 @@ export function convertAnthropicContentToGemini(
export function convertAnthropicMessageToGemini(message: Anthropic.Messages.MessageParam): Content {
return {
role: message.role === "assistant" ? "model" : message.role,
role: message.role === "assistant" ? "model" : "user",
parts: convertAnthropicContentToGemini(message.content),
}
}
@@ -77,6 +120,13 @@ export function convertAnthropicToolToGemini(tool: Anthropic.Messages.Tool): Fun
}
}
/*
It looks like Gemini likes to double-escape certain characters when writing file contents: https://discuss.ai.google.dev/t/function-call-string-property-is-double-escaped/37867
*/
export function unescapeGeminiContent(content: string) {
return content.replace(/\\n/g, "\n").replace(/\\'/g, "'").replace(/\\"/g, '"')
}
export function convertGeminiResponseToAnthropic(
response: EnhancedGenerateContentResponse
): Anthropic.Messages.Message {
@@ -92,9 +142,12 @@ export function convertGeminiResponseToAnthropic(
const functionCalls = response.functionCalls()
if (functionCalls) {
functionCalls.forEach((call, index) => {
if ("content" in call.args && typeof call.args.content === "string") {
call.args.content = unescapeGeminiContent(call.args.content)
}
content.push({
type: "tool_use",
id: `tool_${index}`,
id: `${call.name}-${index}-${Date.now()}`,
name: call.name,
input: call.args,
})
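
Two small illustrations of this hunk's behavior (strings are illustrative): unescapeGeminiContent collapses the doubled escapes Gemini emits in function call args, and each tool_use block now gets an id of the form `${name}-${index}-${Date.now()}`, which is what tool_use_id.split("-")[0] above relies on to recover the tool name.

// Illustrative only, not part of this commit
const doubled = 'const greeting = \\"hello\\"\\nconsole.log(greeting)'
unescapeGeminiContent(doubled)
// => 'const greeting = "hello"\nconsole.log(greeting)'
// An id produced for the first function call in a response might look like:
// "write_to_file-0-1726157372000"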

View File

@@ -220,7 +220,6 @@ const ChatRowContent = ({ message, isExpanded, onToggleExpand, lastModifiedMessa
onClick={() => {
vscode.postMessage({ type: "openFile", text: tool.content })
}}>
<div style={{ display: "flex", alignItems: "center" }}>
<span
style={{
whiteSpace: "nowrap",
@@ -232,7 +231,6 @@ const ChatRowContent = ({ message, isExpanded, onToggleExpand, lastModifiedMessa
}}>
{removeLeadingNonAlphanumeric(tool.path ?? "") + "\u200E"}
</span>
</div>
<span
className={`codicon codicon-link-external`}
style={{ fontSize: 13.5, margin: "1px 0" }}></span>

View File

@@ -49,24 +49,19 @@ const CodeAccordian = ({ code, diff, language, path, isFeedback, isExpanded, onT
msUserSelect: "none",
}}
onClick={onToggleExpand}>
<div style={{ display: "flex", alignItems: "center" }}>
{isFeedback && (
<span className="codicon codicon-feedback" style={{ marginRight: "6px" }}></span>
)}
{isFeedback && <span className="codicon codicon-feedback" style={{ marginRight: "6px" }}></span>}
<span
style={{
whiteSpace: "nowrap",
overflow: "hidden",
textOverflow: "ellipsis",
marginRight: "8px",
// fontSize: "11px",
// trick to get ellipsis at beginning of string
direction: "rtl",
textAlign: "left",
}}>
{isFeedback ? "User Edits" : removeLeadingNonAlphanumeric(path ?? "") + "\u200E"}
</span>
</div>
<span className={`codicon codicon-chevron-${isExpanded ? "up" : "down"}`}></span>
</div>
)}