Remove sharp processing and use original images in requests

This commit is contained in:
Saoud Rizwan
2024-08-08 07:15:14 -04:00
parent 77cbbbfe49
commit b6a8c03317
9 changed files with 49 additions and 606 deletions

View File

@@ -311,10 +311,15 @@ export class ClaudeDev {
/**
 * Converts data-URL image strings into Anthropic image blocks.
 *
 * Each entry is expected to be a data URL of the form
 * "data:image/png;base64,<base64string>"; the MIME type is parsed out of the
 * header so the original image format is preserved (no webp re-encoding).
 *
 * @param images - optional list of base64 data URLs; undefined yields [].
 * @returns one ImageBlockParam per input image.
 */
private formatImagesIntoBlocks(images?: string[]): Anthropic.ImageBlockParam[] {
	return images
		? images.map((dataUrl) => {
				// dataUrl format: "data:image/png;base64,<base64string>"
				const [header, base64] = dataUrl.split(",")
				// "data:image/png;base64" -> "image/png"
				const mimeType = header.split(":")[1].split(";")[0]
				return {
					type: "image",
					source: { type: "base64", media_type: mimeType, data: base64 },
				} as Anthropic.ImageBlockParam
		  })
		: []
}

View File

@@ -149,7 +149,7 @@ export class OpenRouterHandler implements ApiHandler {
if (part.type === "image") {
return {
type: "image_url",
image_url: { url: "data:image/webp;base64," + part.source.data },
image_url: { url: `data:${part.source.media_type};base64,${part.source.data}` },
}
}
return { type: "text", text: part.text }
@@ -161,7 +161,7 @@ export class OpenRouterHandler implements ApiHandler {
toolMessages.forEach((toolMessage) => {
// The Anthropic SDK allows tool results to be a string or an array of text and image blocks, enabling rich and structured content. In contrast, the OpenAI SDK only supports tool results as a single string, so we map the Anthropic tool result parts into one concatenated string to maintain compatibility.
let content: string
let images: string[] = []
let images: Anthropic.Messages.ImageBlockParam[] = []
if (typeof toolMessage.content === "string") {
content = toolMessage.content
} else {
@@ -169,7 +169,7 @@ export class OpenRouterHandler implements ApiHandler {
toolMessage.content
?.map((part) => {
if (part.type === "image") {
images.push(part.source.data)
images.push(part)
return "(see following user message for image)"
}
return part.text
@@ -185,9 +185,9 @@ export class OpenRouterHandler implements ApiHandler {
if (images.length > 0) {
openAiMessages.push({
role: "user",
content: images.map((image) => ({
content: images.map((part) => ({
type: "image_url",
image_url: { url: "data:image/webp;base64," + image },
image_url: { url: `data:${part.source.media_type};base64,${part.source.data}` },
})),
})
}

View File

@@ -1,14 +1,11 @@
import { Anthropic } from "@anthropic-ai/sdk"
import os from "os"
import * as path from "path"
import * as vscode from "vscode"
import { Uri, Webview } from "vscode"
import { ClaudeDev } from "../ClaudeDev"
import { ApiProvider } from "../shared/api"
import { ExtensionMessage } from "../shared/ExtensionMessage"
import { WebviewMessage } from "../shared/WebviewMessage"
import { processPastedImages, selectAndProcessImages } from "../utils/process-images"
import { downloadTask } from "../utils/export-markdown"
import { selectImages } from "../utils/process-images"
/*
https://github.com/microsoft/vscode-webview-ui-toolkit-samples/blob/main/default/weather-webview/src/providers/WeatherViewProvider.ts
@@ -301,16 +298,8 @@ export class ClaudeDevProvider implements vscode.WebviewViewProvider {
downloadTask(this.claudeDev?.apiConversationHistory ?? [])
break
case "selectImages":
const images = await selectAndProcessImages()
const images = await selectImages()
await this.postMessageToWebview({ type: "selectedImages", images })
break
case "processPastedImages":
const pastedImages = message.images ?? []
if (pastedImages.length > 0) {
const processedImages = await processPastedImages(pastedImages)
await this.postMessageToWebview({ type: "selectedImages", images: processedImages })
}
break
// Add more switch case statements here as more webview message commands
// are created within the webview context (i.e. inside media/main.js)

View File

@@ -11,7 +11,6 @@ export interface WebviewMessage {
| "didShowAnnouncement"
| "downloadTask"
| "selectImages"
| "processPastedImages"
text?: string
askResponse?: ClaudeAskResponse
apiConfiguration?: ApiConfiguration

View File

@@ -1,13 +1,13 @@
import * as vscode from "vscode"
import fs from "fs/promises"
import sharp from "sharp"
import * as path from "path"
export async function selectAndProcessImages(): Promise<string[]> {
export async function selectImages(): Promise<string[]> {
const options: vscode.OpenDialogOptions = {
canSelectMany: true,
openLabel: "Select",
filters: {
Images: ["png", "jpg", "jpeg", "gif", "webp", "tiff", "avif", "svg"], // sharp can convert these to webp which both anthropic and openrouter support
Images: ["png", "jpg", "jpeg", "webp"], // supported by anthropic and openrouter
},
}
@@ -20,45 +20,26 @@ export async function selectAndProcessImages(): Promise<string[]> {
return await Promise.all(
fileUris.map(async (uri) => {
const imagePath = uri.fsPath
const originalBuffer = await fs.readFile(imagePath)
return convertToWebpBase64(originalBuffer)
const buffer = await fs.readFile(imagePath)
const base64 = buffer.toString("base64")
const mimeType = getMimeType(imagePath)
const dataUrl = `data:${mimeType};base64,${base64}`
return dataUrl
})
)
}
/**
 * Re-encodes a batch of pasted images to webp.
 *
 * @param base64Strings - raw base64 payloads (no data-URL prefix).
 * @returns webp-encoded base64 strings, in the same order as the input.
 */
export async function processPastedImages(base64Strings: string[]): Promise<string[]> {
	// Kick off every conversion up front so they run concurrently.
	const conversions = base64Strings.map((raw) => convertToWebpBase64(Buffer.from(raw, "base64")))
	return Promise.all(conversions)
}
async function convertToWebpBase64(buffer: Buffer): Promise<string> {
const processedBuffer = await sharp(buffer)
/*
Anthropic docs recommendations:
- To improve time-to-first-token resize images to no more than 1.15 megapixels (and within 1568 pixels in both dimensions)
- WebP is a newer image format that's more efficient than PNG and JPEG, so ideal for keeping token usage low. (ive seen the following compression decrease size by 10x)
*/
.resize(1568, 1568, {
fit: "inside", // maintain aspect ratio
withoutEnlargement: true, // don't enlarge smaller images
})
.webp({
// NOTE: consider increasing effort from 4 to 6 (max), this may increase processing time by up to ~500ms
quality: 80,
})
.toBuffer()
const base64 = processedBuffer.toString("base64")
// console.log({
// originalSize: buffer.length,
// processedSize: processedBuffer.length,
// base64,
// })
return base64
/**
 * Resolves an image file's MIME type from its extension.
 *
 * @param filePath - path whose extension determines the type (case-insensitive).
 * @returns the MIME type string, e.g. "image/png".
 * @throws Error when the extension is not a supported image format.
 */
function getMimeType(filePath: string): string {
	// Supported extensions mapped to their MIME types.
	const mimeTypes: Record<string, string> = {
		".png": "image/png",
		".jpeg": "image/jpeg",
		".jpg": "image/jpeg",
		".webp": "image/webp",
	}
	const ext = path.extname(filePath).toLowerCase()
	const mimeType = mimeTypes[ext]
	if (mimeType === undefined) {
		throw new Error(`Unsupported file type: ${ext}`)
	}
	return mimeType
}