Fix sliding window context management's handling of images and get more accurate token estimates

Saoud Rizwan
2024-08-26 22:49:13 -04:00
parent 63f3589f89
commit 273eb3acad
3 changed files with 142 additions and 10 deletions
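
In short: when a message must be truncated to fit the context window, image blocks are now replaced with plain text placeholders instead of image blocks with emptied base64 data (the old replacement also dropped media_type from the source, leaving an invalid image block in the payload), and message tokens are now estimated block by block, including a dimension-based estimate for images. A minimal sketch of the per-block replacement, with illustrative values that are not part of the diff:

	// before truncation: an image block in the conversation history
	const imageBlock: Anthropic.Messages.ImageBlockParam = {
		type: "image",
		source: { type: "base64", media_type: "image/png", data: "<base64 data>" },
	}
	// after truncation: a text placeholder, so no empty image block reaches the API
	const placeholder: Anthropic.Messages.TextBlockParam = {
		type: "text",
		text: "(image removed due to context limits)",
	}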


@@ -1,5 +1,8 @@
 import { Anthropic } from "@anthropic-ai/sdk"
 import { countTokens } from "@anthropic-ai/tokenizer"
+import { Buffer } from "buffer"
+import sizeOf from "image-size"
+import cloneDeep from "clone-deep"
 
 export function slidingWindowContextManagement(
 	contextWindow: number,
@@ -18,28 +21,31 @@ export function slidingWindowContextManagement(
 	}
 
 	// If over limit, remove messages starting from the third message onwards (task and claude's step-by-step thought process are important to keep in context)
-	const newMessages = [...messages]
+	const newMessages = cloneDeep(messages) // since we're manipulating nested objects and arrays, need to deep clone to prevent mutating original history
 	let index = 2
 	while (totalMessageTokens > availableTokens && index < newMessages.length) {
 		const messageToEmpty = newMessages[index]
 		const originalTokens = countMessageTokens(messageToEmpty)
 		// Empty the content of the message (messages must be in a specific order so we can't just remove)
 		if (typeof messageToEmpty.content === "string") {
-			messageToEmpty.content = ""
+			messageToEmpty.content = "(truncated due to context limits)"
 		} else if (Array.isArray(messageToEmpty.content)) {
 			messageToEmpty.content = messageToEmpty.content.map((item) => {
 				if (typeof item === "string") {
 					return {
 						type: "text",
-						text: "(truncated due to context window)",
+						text: "(truncated due to context limits)",
 					} as Anthropic.Messages.TextBlockParam
 				} else if (item.type === "text") {
 					return {
 						type: "text",
-						text: "(truncated due to context window)",
+						text: "(truncated due to context limits)",
 					} as Anthropic.Messages.TextBlockParam
 				} else if (item.type === "image") {
-					return { ...item, source: { type: "base64", data: "" } } as Anthropic.Messages.ImageBlockParam
+					return {
+						type: "text",
+						text: "(image removed due to context limits)",
+					} as Anthropic.Messages.TextBlockParam
 				} else if (item.type === "tool_use") {
 					return { ...item, input: {} } as Anthropic.Messages.ToolUseBlockParam
 				} else if (item.type === "tool_result") {
@@ -48,9 +54,9 @@ export function slidingWindowContextManagement(
 					content: Array.isArray(item.content)
 						? item.content.map((contentItem) =>
 								contentItem.type === "text"
-									? { ...contentItem, text: "(truncated due to context window)" }
+									? { type: "text", text: "(truncated due to context limits)" }
 									: contentItem.type === "image"
-									? { ...contentItem, source: { type: "base64", data: "" } }
+									? { type: "text", text: "(image removed due to context limits)" }
 									: contentItem
 						  )
 						: "",
@@ -69,7 +75,50 @@ export function slidingWindowContextManagement(
 function countMessageTokens(message: Anthropic.Messages.MessageParam): number {
 	if (typeof message.content === "string") {
 		return countTokens(message.content)
+	} else if (Array.isArray(message.content)) {
+		return message.content.reduce((sum, item) => {
+			if (typeof item === "string") {
+				return sum + countTokens(item)
+			} else if (item.type === "text") {
+				return sum + countTokens(item.text)
+			} else if (item.type === "image") {
+				return sum + estimateImageTokens(item.source.data)
+			} else if (item.type === "tool_use") {
+				return sum + countTokens(JSON.stringify(item.input))
+			} else if (item.type === "tool_result") {
+				if (Array.isArray(item.content)) {
+					return (
+						sum +
+						item.content.reduce((contentSum, contentItem) => {
+							if (contentItem.type === "text") {
+								return contentSum + countTokens(contentItem.text)
+							} else if (contentItem.type === "image") {
+								return contentSum + estimateImageTokens(contentItem.source.data)
+							}
+							return contentSum + countTokens(JSON.stringify(contentItem))
+						}, 0)
+					)
+				} else {
+					return sum + countTokens(item.content || "")
+				}
+			} else {
+				return sum + countTokens(JSON.stringify(item))
+			}
+		}, 0)
 	} else {
 		return countTokens(JSON.stringify(message.content))
 	}
 }
+
+function estimateImageTokens(base64: string): number {
+	const base64Data = base64.split(";base64,").pop()
+	if (base64Data) {
+		const buffer = Buffer.from(base64Data, "base64")
+		const dimensions = sizeOf(buffer)
+		if (dimensions.width && dimensions.height) {
+			// "you can estimate the number of tokens used through this algorithm: tokens = (width px * height px)/750"
+			return Math.ceil((dimensions.width * dimensions.height) / 750)
+		}
+	}
+	return countTokens(base64)
+}
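
For reference, the quoted heuristic tokens = (width px * height px) / 750 is from Anthropic's vision documentation. A quick sanity check of the arithmetic against Anthropic's published example size:

	// a 1092 x 1092 px image:
	Math.ceil((1092 * 1092) / 750) // => Math.ceil(1589.952) = 1590 tokens

Two details worth noting about estimateImageTokens: the split(";base64,").pop() accepts both raw base64 strings and data URLs (e.g. "data:image/png;base64,..."), and when dimensions can't be read from the buffer it falls back to running the tokenizer over the raw base64 string, which overestimates heavily but errs on the safe side of the context limit.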