Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
22 changes: 7 additions & 15 deletions pkg/runtime/toolexec/dispatcher.go
Original file line number Diff line number Diff line change
Expand Up @@ -724,21 +724,13 @@ func buildMultiContent(text string, images []tools.MediaContent) []chat.MessageP
parts := make([]chat.MessagePart, 0, 1+len(images))
parts = append(parts, chat.MessagePart{Type: chat.MessagePartTypeText, Text: text})
for _, img := range images {
switch {
case img.FilePath != "":
parts = append(parts, chat.MessagePart{
Type: chat.MessagePartTypeText,
Text: fmt.Sprintf("[image saved to %s (%s)]", img.FilePath, img.MimeType),
})
case img.Data != "":
parts = append(parts, chat.MessagePart{
Type: chat.MessagePartTypeImageURL,
ImageURL: &chat.MessageImageURL{
URL: "data:" + img.MimeType + ";base64," + img.Data,
Detail: chat.ImageURLDetailAuto,
},
})
}
parts = append(parts, chat.MessagePart{
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

[MEDIUM] Missing empty-data guard may produce malformed data URIs

The previous buildMultiContent had a case img.Data != "": guard that silently skipped any MediaContent with an empty Data field. The restored code unconditionally builds a MessagePart for every image:

URL: "data:" + img.MimeType + ";base64," + img.Data,

If img.Data is empty (e.g., an MCP server returns an ImageContent with a zero-length Data slice — base64.StdEncoding.EncodeToString(nil) returns ""), the resulting URL is "data:image/png;base64," — an invalid data URI that will be sent verbatim to the LLM API. Most LLM clients will return an error or silently fail on such a URL.

Since encodeMedia always calls base64.StdEncoding.EncodeToString, this can only trigger when an MCP server sends an empty ImageContent.Data, but the old guard handled that defensively. Consider adding back a check:

if img.Data != "" {
    parts = append(parts, ...)
}

Type: chat.MessagePartTypeImageURL,
ImageURL: &chat.MessageImageURL{
URL: "data:" + img.MimeType + ";base64," + img.Data,
Detail: chat.ImageURLDetailAuto,
},
})
}
return parts
}
Expand Down
114 changes: 10 additions & 104 deletions pkg/tools/mcp/mcp.go
Original file line number Diff line number Diff line change
Expand Up @@ -160,11 +160,6 @@ type Toolset struct {

supervisor *lifecycle.Supervisor

// mediaDir is the toolset-scoped temp dir holding spooled media
// payloads. Created lazily on first spool, removed by Stop.
mediaMu sync.Mutex
mediaDir string

mu sync.Mutex

// Cached tools and prompts, invalidated via MCP notifications and
Expand Down Expand Up @@ -431,7 +426,6 @@ func (ts *Toolset) Start(ctx context.Context) error {
// Stop tears the supervisor down. Idempotent.
func (ts *Toolset) Stop(ctx context.Context) error {
slog.DebugContext(ctx, "Stopping MCP toolset", "server", ts.logID)
defer ts.cleanupMediaDir()
if ts.supervisor == nil {
return nil
}
Expand Down Expand Up @@ -700,7 +694,7 @@ func (ts *Toolset) callTool(ctx context.Context, toolCall tools.ToolCall) (*tool
return nil, fmt.Errorf("failed to call tool: %w", err)
}

result := ts.processMCPContent(resp)
result := processMCPContent(resp)
slog.DebugContext(ctx, "MCP tool call completed", "tool", toolCall.Function.Name, "output_length", len(result.Output))
slog.DebugContext(ctx, result.Output)
return result, nil
Expand All @@ -720,13 +714,7 @@ func isInitNotificationSendError(err error) bool {
return false
}

const maxInlineMediaBytes = 256 * 1024

// writeMediaFile is a package-level indirection so tests can simulate
// disk failures without manipulating the filesystem.
var writeMediaFile = defaultWriteMediaFile

func (ts *Toolset) processMCPContent(toolResult *mcp.CallToolResult) *tools.ToolCallResult {
func processMCPContent(toolResult *mcp.CallToolResult) *tools.ToolCallResult {
var text strings.Builder
var images, audios []tools.MediaContent

Expand All @@ -735,9 +723,9 @@ func (ts *Toolset) processMCPContent(toolResult *mcp.CallToolResult) *tools.Tool
case *mcp.TextContent:
text.WriteString(c.Text)
case *mcp.ImageContent:
images = append(images, ts.encodeMedia(c.Data, c.MIMEType))
images = append(images, encodeMedia(c.Data, c.MIMEType))
case *mcp.AudioContent:
audios = append(audios, ts.encodeMedia(c.Data, c.MIMEType))
audios = append(audios, encodeMedia(c.Data, c.MIMEType))
case *mcp.ResourceLink:
if c.Name != "" {
// Escape ] in name and ) in URI to prevent broken markdown links.
Expand Down Expand Up @@ -772,94 +760,12 @@ func (ts *Toolset) processMCPContent(toolResult *mcp.CallToolResult) *tools.Tool
}
}

// encodeMedia keeps small payloads inline and spools larger ones to disk so the
// session and TUI do not retain duplicate base64 copies. Spooled files live
// under a toolset-scoped temp directory removed by Stop.
func (ts *Toolset) encodeMedia(data []byte, mimeType string) tools.MediaContent {
media := tools.MediaContent{MimeType: mimeType}
if len(data) <= maxInlineMediaBytes {
media.Data = base64.StdEncoding.EncodeToString(data)
return media
}

dir, err := ts.ensureMediaDir()
if err == nil {
var path string
path, err = writeMediaFile(dir, data, mimeType)
if err == nil {
media.FilePath = path
return media
}
}
slog.Warn("failed to spool MCP media to disk", "mime_type", mimeType, "bytes", len(data), "error", err)
media.Data = base64.StdEncoding.EncodeToString(data)
return media
}

// ensureMediaDir lazily creates the toolset-scoped temp dir for spooled
// media payloads. The directory is removed by Stop.
func (ts *Toolset) ensureMediaDir() (string, error) {
ts.mediaMu.Lock()
defer ts.mediaMu.Unlock()
if ts.mediaDir != "" {
return ts.mediaDir, nil
}
dir, err := os.MkdirTemp("", "docker-agent-mcp-media-*")
if err != nil {
return "", err
}
ts.mediaDir = dir
return dir, nil
}

// cleanupMediaDir removes the toolset-scoped media spool directory, if any.
func (ts *Toolset) cleanupMediaDir() {
ts.mediaMu.Lock()
dir := ts.mediaDir
ts.mediaDir = ""
ts.mediaMu.Unlock()
if dir == "" {
return
}
if err := os.RemoveAll(dir); err != nil {
slog.Warn("failed to remove MCP media spool directory", "dir", dir, "error", err)
}
}

func defaultWriteMediaFile(dir string, data []byte, mimeType string) (string, error) {
f, err := os.CreateTemp(dir, "media-*"+mediaExtension(mimeType))
if err != nil {
return "", err
}
path := f.Name()
if _, err := f.Write(data); err != nil {
_ = f.Close()
_ = os.Remove(path)
return "", err
}
if err := f.Close(); err != nil {
_ = os.Remove(path)
return "", err
}
return path, nil
}

func mediaExtension(mimeType string) string {
switch mimeType {
case "image/png":
return ".png"
case "image/jpeg":
return ".jpg"
case "image/gif":
return ".gif"
case "image/webp":
return ".webp"
case "audio/wav", "audio/wave", "audio/x-wav":
return ".wav"
case "audio/mpeg", "audio/mp3":
return ".mp3"
default:
return ".bin"
// encodeMedia re-encodes raw bytes (as decoded by the MCP SDK) back to base64
// for our internal MediaContent representation.
func encodeMedia(data []byte, mimeType string) tools.MediaContent {
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

[MEDIUM] Memory pressure regression: large MCP media always held in-memory as base64

The reverted encodeMedia unconditionally base64-encodes all MCP media payloads — no matter how large — directly into a string field held in memory for the lifetime of the session. The spool-to-disk fix (commit 6dd0262) was introduced specifically to avoid this: it kept payloads ≤ 256 KB inline and wrote larger ones to a temp file.

With this revert, a single MCP tool response containing, e.g., a 10 MB screenshot will allocate ~13 MB of base64 in RAM, duplicated across the session history and the TUI. Multiple such responses compound quickly.

If the spool approach is being dropped intentionally, consider at minimum documenting the decision or adding a size cap/warning log for unexpectedly large payloads.

return tools.MediaContent{
Data: base64.StdEncoding.EncodeToString(data),
MimeType: mimeType,
}
}

Expand Down
77 changes: 1 addition & 76 deletions pkg/tools/mcp/mcp_test.go
Original file line number Diff line number Diff line change
@@ -1,12 +1,9 @@
package mcp

import (
"bytes"
"context"
"errors"
"fmt"
"iter"
"os"
"sync"
"sync/atomic"
"testing"
Expand Down Expand Up @@ -481,7 +478,7 @@ func TestProcessMCPContent(t *testing.T) {
t.Run(tt.name, func(t *testing.T) {
t.Parallel()

result := (&Toolset{}).processMCPContent(tt.input)
result := processMCPContent(tt.input)

assert.Equal(t, tt.wantOutput, result.Output)
assert.Equal(t, tt.wantIsError, result.IsError)
Expand Down Expand Up @@ -539,75 +536,3 @@ func TestCallToolRecoversFromErrSessionMissing(t *testing.T) {
assert.Equal(t, "recovered", result.Output)
assert.Equal(t, int32(2), callCount.Load(), "expected exactly 2 CallTool invocations (1 failed + 1 retry)")
}

func TestProcessMCPContentSpoolsLargeMedia(t *testing.T) {
ts := &Toolset{}
t.Cleanup(ts.cleanupMediaDir)

tests := []struct {
name string
size int
wantInline bool
}{
{"at threshold stays inline", maxInlineMediaBytes, true},
{"above threshold spools", maxInlineMediaBytes + 1, false},
}

for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
data := bytes.Repeat([]byte("x"), tt.size)
result := ts.processMCPContent(callToolResult(&mcp.ImageContent{Data: data, MIMEType: "image/png"}))

require.Len(t, result.Images, 1)
img := result.Images[0]
assert.Equal(t, "image/png", img.MimeType)

if tt.wantInline {
assert.NotEmpty(t, img.Data)
assert.Empty(t, img.FilePath)
return
}

assert.Empty(t, img.Data)
require.NotEmpty(t, img.FilePath)

got, err := os.ReadFile(img.FilePath)
require.NoError(t, err)
assert.Equal(t, data, got)
})
}
}

func TestEncodeMediaFallsBackToInlineOnDiskFailure(t *testing.T) {
original := writeMediaFile
t.Cleanup(func() { writeMediaFile = original })
writeMediaFile = func(string, []byte, string) (string, error) {
return "", errors.New("disk full")
}

ts := &Toolset{}
t.Cleanup(ts.cleanupMediaDir)

data := bytes.Repeat([]byte("x"), maxInlineMediaBytes+1)
result := ts.processMCPContent(callToolResult(&mcp.ImageContent{Data: data, MIMEType: "image/png"}))

require.Len(t, result.Images, 1)
img := result.Images[0]
assert.Empty(t, img.FilePath)
assert.NotEmpty(t, img.Data, "falls back to inline base64 when disk write fails")
}

func TestToolsetStopRemovesMediaDir(t *testing.T) {
ts := &Toolset{}
data := bytes.Repeat([]byte("x"), maxInlineMediaBytes+1)
media := ts.encodeMedia(data, "image/png")
require.NotEmpty(t, media.FilePath)

_, err := os.Stat(media.FilePath)
require.NoError(t, err)

require.NoError(t, ts.Stop(t.Context()))

_, err = os.Stat(media.FilePath)
assert.True(t, os.IsNotExist(err), "spooled media file should be removed by Stop")
}
7 changes: 2 additions & 5 deletions pkg/tools/tools.go
Original file line number Diff line number Diff line change
Expand Up @@ -73,11 +73,8 @@ type FunctionCall struct {
// MediaContent represents base64-encoded binary data (image, audio, etc.)
// returned by a tool.
type MediaContent struct {
// Data is the base64-encoded payload. It is kept only for small media; large
// MCP payloads are spooled to FilePath to avoid retaining duplicate base64.
Data string `json:"data,omitempty"`
// FilePath is an optional local file containing the decoded media payload.
FilePath string `json:"filePath,omitempty"`
// Data is the base64-encoded payload.
Data string `json:"data"`
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

[LOW] Removing omitempty from MediaContent.Data changes JSON serialisation

The field tag changed from json:"data,omitempty" to json:"data". When Data is an empty string, the old tag would omit the field entirely from JSON output; the new tag always emits "data":"".

In the current code path encodeMedia always populates Data, so this is unlikely to matter in practice. However, any code that constructs a MediaContent with an empty Data (e.g. tests, future zero-value defaults) will now emit the extra field, which may surprise consumers or break strict-equality JSON assertions.

// MimeType identifies the content type (e.g. "image/png", "audio/wav").
MimeType string `json:"mimeType"`
}
Expand Down
Loading