From 3b004aed374aea60dedca17d38edee4b9b6e937a Mon Sep 17 00:00:00 2001 From: Saoud Rizwan <7799382+saoudrizwan@users.noreply.github.com> Date: Thu, 12 Sep 2024 11:49:32 -0400 Subject: [PATCH] Fix gemini message conversion --- src/api/openrouter.ts | 21 ++++++ src/shared/api.ts | 13 +++- src/utils/gemini-format.ts | 81 +++++++++++++++++---- webview-ui/src/components/ChatRow.tsx | 24 +++--- webview-ui/src/components/CodeAccordian.tsx | 31 ++++---- 5 files changed, 121 insertions(+), 49 deletions(-) diff --git a/src/api/openrouter.ts b/src/api/openrouter.ts index f4a5efa..6e43ea6 100644 --- a/src/api/openrouter.ts +++ b/src/api/openrouter.ts @@ -70,6 +70,27 @@ export class OpenRouterHandler implements ApiHandler { const anthropicMessage = convertToAnthropicMessage(completion) + // Check if the model is Gemini Flash and remove extra escapes in tool result args + // switch (this.getModel().id) { + // case "google/gemini-pro-1.5": + // case "google/gemini-flash-1.5": + // const content = anthropicMessage.content + // for (const block of content) { + // if ( + // block.type === "tool_use" && + // typeof block.input === "object" && + // block.input !== null && + // "content" in block.input && + // typeof block.input.content === "string" + // ) { + // block.input.content = unescapeGeminiContent(block.input.content) + // } + // } + // break + // default: + // break + // } + return { message: anthropicMessage } } diff --git a/src/shared/api.ts b/src/shared/api.ts index db40002..a803a11 100644 --- a/src/shared/api.ts +++ b/src/shared/api.ts @@ -201,17 +201,22 @@ export const openRouterModels = { // outputPrice: 0.06, // }, // OpenRouter needs to fix mapping gemini 1.5 responses for tool calls properly, they return content with line breaks formatted wrong (too many escapes), and throw errors for being in the wrong order when they're not. They also cannot handle feedback given to a request with multiple tools. Giving feedback to one tool use requests works fine. 
("Please ensure that function response turn comes immediately after a function call turn. And the number of function response parts should be equal to number of function call parts of the function call turn.") + // UPDATE: I keep getting "400: Please ensure that function call turn comes immediately after a user turn or after a function response turn.", which gets fixed as soon as I switch to openrouter/claude, so it's obviously an error on OpenRouter's end transforming the message structure. This is likely the culprit behind the tool order error people have seen with gpt4o. // "google/gemini-pro-1.5": { // maxTokens: 8192, - // supportsImages: false, // "Function Calling is not supported with non-text input" + // contextWindow: 2_097_152, + // supportsImages: true, // "Function Calling is not supported with non-text input" + // supportsPromptCache: false, // inputPrice: 2.5, // outputPrice: 7.5, // }, // "google/gemini-flash-1.5": { // maxTokens: 8192, - // supportsImages: false, // "Function Calling is not supported with non-text input" - // inputPrice: 0.25, - // outputPrice: 0.75, + // contextWindow: 1_048_576, + // supportsImages: true, // "Function Calling is not supported with non-text input" + // supportsPromptCache: false, + // inputPrice: 0.0375, + // outputPrice: 0.15, // }, // "google/gemini-pro": { // maxTokens: 8192, diff --git a/src/utils/gemini-format.ts b/src/utils/gemini-format.ts index dd3207f..8601939 100644 --- a/src/utils/gemini-format.ts +++ b/src/utils/gemini-format.ts @@ -1,5 +1,15 @@ import { Anthropic } from "@anthropic-ai/sdk" -import { Content, EnhancedGenerateContentResponse, FunctionDeclaration, Part, SchemaType } from "@google/generative-ai" +import { + Content, + EnhancedGenerateContentResponse, + FunctionCallPart, + FunctionDeclaration, + FunctionResponsePart, + InlineDataPart, + Part, + SchemaType, + TextPart, +} from "@google/generative-ai" export function convertAnthropicContentToGemini( content: @@ -12,12 +22,12 @@ export 
function convertAnthropicContentToGemini( > ): Part[] { if (typeof content === "string") { - return [{ text: content }] + return [{ text: content } as TextPart] } - return content.map((block) => { + return content.flatMap((block) => { switch (block.type) { case "text": - return { text: block.text } + return { text: block.text } as TextPart case "image": if (block.source.type !== "base64") { throw new Error("Unsupported image source type") @@ -27,22 +37,55 @@ export function convertAnthropicContentToGemini( data: block.source.data, mimeType: block.source.media_type, }, - } + } as InlineDataPart case "tool_use": return { functionCall: { name: block.name, args: block.input, }, - } as Part + } as FunctionCallPart case "tool_result": - return { - functionResponse: { - name: block.tool_use_id, - response: { - content: block.content, + const name = block.tool_use_id.split("-")[0] + if (!block.content) { + return [] + } + if (typeof block.content === "string") { + return { + functionResponse: { + name, + response: { + name, + content: block.content, + }, }, - }, + } as FunctionResponsePart + } else { + // The only case when tool_result could be an array is when the tool failed and we're providing, i.e., user feedback, potentially with images + const textParts = block.content.filter((part) => part.type === "text") + const imageParts = block.content.filter((part) => part.type === "image") + const text = textParts.length > 0 ? textParts.map((part) => part.text).join("\n\n") : "" + const imageText = imageParts.length > 0 ? 
"\n\n(See next part for image)" : "" + return [ + { + functionResponse: { + name, + response: { + name, + content: text + imageText, + }, + }, + } as FunctionResponsePart, + ...imageParts.map( + (part) => + ({ + inlineData: { + data: part.source.data, + mimeType: part.source.media_type, + }, + } as InlineDataPart) + ), + ] } default: throw new Error(`Unsupported content block type: ${(block as any).type}`) @@ -52,7 +95,7 @@ export function convertAnthropicContentToGemini( export function convertAnthropicMessageToGemini(message: Anthropic.Messages.MessageParam): Content { return { - role: message.role === "assistant" ? "model" : message.role, + role: message.role === "assistant" ? "model" : "user", parts: convertAnthropicContentToGemini(message.content), } } @@ -77,6 +120,13 @@ export function convertAnthropicToolToGemini(tool: Anthropic.Messages.Tool): Fun } } +/* +It looks like gemini likes to double escape certain characters when writing file contents: https://discuss.ai.google.dev/t/function-call-string-property-is-double-escaped/37867 +*/ +export function unescapeGeminiContent(content: string) { + return content.replace(/\\n/g, "\n").replace(/\\'/g, "'").replace(/\\"/g, '"') +} + export function convertGeminiResponseToAnthropic( response: EnhancedGenerateContentResponse ): Anthropic.Messages.Message { @@ -92,9 +142,12 @@ export function convertGeminiResponseToAnthropic( const functionCalls = response.functionCalls() if (functionCalls) { functionCalls.forEach((call, index) => { + if ("content" in call.args && typeof call.args.content === "string") { + call.args.content = unescapeGeminiContent(call.args.content) + } content.push({ type: "tool_use", - id: `tool_${index}`, + id: `${call.name}-${index}-${Date.now()}`, name: call.name, input: call.args, }) diff --git a/webview-ui/src/components/ChatRow.tsx b/webview-ui/src/components/ChatRow.tsx index 26e9917..62b332a 100644 --- a/webview-ui/src/components/ChatRow.tsx +++ b/webview-ui/src/components/ChatRow.tsx @@ 
-220,19 +220,17 @@ const ChatRowContent = ({ message, isExpanded, onToggleExpand, lastModifiedMessa onClick={() => { vscode.postMessage({ type: "openFile", text: tool.content }) }}> -