From 7cd79b35bc3c240119ef3f23429919ec4604d5ae Mon Sep 17 00:00:00 2001 From: Sayt-0 Date: Mon, 22 Jun 2026 14:31:13 +0200 Subject: [PATCH] fix: forward image and PDF attachments for DMR models DMR-hosted models are not in the models.dev catalog, so the capability lookup in modelinfo.LoadCaps always missed for "dmr/" IDs and the conversion path fell back to a text-only strategy, silently dropping image and PDF attachments (issue #2739). Let each DMR model declare its multimodal support explicitly via provider_opts.supports_images and provider_opts.supports_pdf. The DMR client parses and validates these at creation time, stores them as the model's attachment capabilities, and injects them into message conversion through the new oaistream.ConvertMessagesWithCaps, bypassing the models.dev lookup that can never resolve a DMR model. While here, the oaistream Chat Completions path now forwards PDFs as a file content part (base64 data URI plus filename) instead of dropping them. This is gated by the resolved capabilities, so it only triggers for models declared PDF-capable: DMR via provider_opts, or catalog models that models.dev reports as PDF-capable on the legacy Chat Completions path. The conservative text-only default is unchanged: a model with no declared capability still drops image and PDF attachments. Fixes #2739. --- agent-schema.json | 2 +- examples/dmr.yaml | 11 + pkg/model/provider/dmr/attachments_test.go | 217 ++++++++++++++++++ pkg/model/provider/dmr/client.go | 21 +- pkg/model/provider/dmr/configure.go | 40 ++++ pkg/model/provider/oaistream/attachments.go | 52 +++-- .../provider/oaistream/attachments_test.go | 91 ++++++++ pkg/model/provider/oaistream/messages.go | 29 ++- 8 files changed, 425 insertions(+), 38 deletions(-) create mode 100644 pkg/model/provider/dmr/attachments_test.go diff --git a/agent-schema.json b/agent-schema.json index 86fbaa22a..932677fdd 100644 --- a/agent-schema.json +++ b/agent-schema.json @@ -1422,7 +1422,7 @@ }, "provider_opts": { "type": "object", - "description": "Provider-specific options. Sampling parameters: top_k (integer, supported by anthropic, google, amazon-bedrock, and custom OpenAI-compatible providers like vLLM/Ollama), repetition_penalty (float, forwarded to custom OpenAI-compatible providers), min_p (float, forwarded to custom providers), seed (integer, forwarded to OpenAI). Lifecycle: unload_api (string) overrides the unload endpoint inherited from the provider config (relative path resolved against base_url's scheme+host, or an absolute URL); used by the runtime's `unload` on_agent_switch builtin hook to release model resources between agent switches. Infrastructure options: http_headers (map of string to string, adds custom HTTP headers to every request; used for OpenAI-compatible providers like github-copilot which requires Copilot-Integration-Id). dmr: runtime_flags. anthropic/amazon-bedrock (Claude): interleaved_thinking (boolean, default true), thinking_display ('summarized', 'omitted', or 'display') controls whether thinking blocks are returned in responses when thinking is enabled. Claude Opus 4.7 hides thinking by default ('omitted'); set thinking_display: summarized (or thinking_display: display) to receive thinking blocks. anthropic: fallbacks (array of model ID strings, in priority order; enables the server-side-fallback beta so requests refused by safety classifiers, e.g. on Claude Fable 5, are retried with the listed models in a single round trip). openai: transport ('sse' or 'websocket') to choose between SSE and WebSocket streaming for the Responses API. openai/anthropic/google: rerank_prompt (string) to fully override the system prompt used for RAG reranking (advanced - prefer using results.reranking.criteria for domain-specific guidance). Google: google_search (boolean) enables Google Search grounding, google_maps (boolean) enables Google Maps grounding, code_execution (boolean) enables server-side code execution.", + "description": "Provider-specific options. Sampling parameters: top_k (integer, supported by anthropic, google, amazon-bedrock, and custom OpenAI-compatible providers like vLLM/Ollama), repetition_penalty (float, forwarded to custom OpenAI-compatible providers), min_p (float, forwarded to custom providers), seed (integer, forwarded to OpenAI). Lifecycle: unload_api (string) overrides the unload endpoint inherited from the provider config (relative path resolved against base_url's scheme+host, or an absolute URL); used by the runtime's `unload` on_agent_switch builtin hook to release model resources between agent switches. Infrastructure options: http_headers (map of string to string, adds custom HTTP headers to every request; used for OpenAI-compatible providers like github-copilot which requires Copilot-Integration-Id). dmr: runtime_flags; supports_images (boolean) and supports_pdf (boolean) declare which document attachment types the local model accepts. DMR-hosted models are not in the models.dev catalog, so attachment capabilities cannot be detected automatically and default to text-only; set supports_images: true for a vision model so image attachments are forwarded instead of silently dropped. anthropic/amazon-bedrock (Claude): interleaved_thinking (boolean, default true), thinking_display ('summarized', 'omitted', or 'display') controls whether thinking blocks are returned in responses when thinking is enabled. Claude Opus 4.7 hides thinking by default ('omitted'); set thinking_display: summarized (or thinking_display: display) to receive thinking blocks. anthropic: fallbacks (array of model ID strings, in priority order; enables the server-side-fallback beta so requests refused by safety classifiers, e.g. on Claude Fable 5, are retried with the listed models in a single round trip). openai: transport ('sse' or 'websocket') to choose between SSE and WebSocket streaming for the Responses API. openai/anthropic/google: rerank_prompt (string) to fully override the system prompt used for RAG reranking (advanced - prefer using results.reranking.criteria for domain-specific guidance). Google: google_search (boolean) enables Google Search grounding, google_maps (boolean) enables Google Maps grounding, code_execution (boolean) enables server-side code execution.", "additionalProperties": true }, "track_usage": { diff --git a/examples/dmr.yaml b/examples/dmr.yaml index a4d112085..31fcfe2f3 100644 --- a/examples/dmr.yaml +++ b/examples/dmr.yaml @@ -23,3 +23,14 @@ models: speculative_draft_model: ai/qwen3:0.6B-Q4_K_M speculative_num_tokens: 16 # (this is the llama.cpp default if omitted) speculative_acceptance_rate: 0.8 # (this is the llama.cpp default if omitted) + + # A vision-capable local model. DMR-hosted models are not in the models.dev + # catalog, so docker-agent cannot detect multimodal support automatically and + # defaults to text-only (image/PDF attachments are dropped). Declare the + # capabilities explicitly so attachments are forwarded to the model. + qwen_vision: + provider: dmr + model: ai/qwen2.5-vl + provider_opts: + supports_images: true + supports_pdf: false diff --git a/pkg/model/provider/dmr/attachments_test.go b/pkg/model/provider/dmr/attachments_test.go new file mode 100644 index 000000000..e08c2ab43 --- /dev/null +++ b/pkg/model/provider/dmr/attachments_test.go @@ -0,0 +1,217 @@ +package dmr + +import ( + "encoding/json" + "io" + "net/http" + "net/http/httptest" + "strings" + "testing" + + "github.com/openai/openai-go/v3" + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + + "github.com/docker/docker-agent/pkg/chat" + "github.com/docker/docker-agent/pkg/config/latest" + "github.com/docker/docker-agent/pkg/modelinfo" +) + +// minPNG is a minimal PNG magic-byte header for use in tests. +var minPNG = []byte{0x89, 0x50, 0x4E, 0x47, 0x0D, 0x0A, 0x1A, 0x0A} + +// countParts counts content parts across all converted user messages for which +// pred returns true. +func countParts(msgs []openai.ChatCompletionMessageParamUnion, pred func(openai.ChatCompletionContentPartUnionParam) bool) int { + n := 0 + for _, m := range msgs { + if m.OfUser == nil { + continue + } + for _, p := range m.OfUser.Content.OfArrayOfContentParts { + if pred(p) { + n++ + } + } + } + return n +} + +func countImageParts(msgs []openai.ChatCompletionMessageParamUnion) int { + return countParts(msgs, func(p openai.ChatCompletionContentPartUnionParam) bool { return p.OfImageURL != nil }) +} + +func countFileParts(msgs []openai.ChatCompletionMessageParamUnion) int { + return countParts(msgs, func(p openai.ChatCompletionContentPartUnionParam) bool { return p.OfFile != nil }) +} + +// docMessage returns a single user message carrying one inline document attachment. +func docMessage(name, mime string, data []byte) []chat.Message { + return []chat.Message{{ + Role: chat.MessageRoleUser, + MultiContent: []chat.MessagePart{ + {Type: chat.MessagePartTypeText, Text: "Describe the attachment."}, + { + Type: chat.MessagePartTypeDocument, + Document: &chat.Document{ + Name: name, + MimeType: mime, + Source: chat.DocumentSource{InlineData: data}, + }, + }, + }, + }} +} + +// TestDMRConvertMessagesRespectsDeclaredCaps is the regression test for issue +// #2739: DMR-hosted models are absent from models.dev, so a store lookup always +// missed and image/PDF attachments were silently dropped. Capabilities are now +// declared via provider_opts and injected explicitly, so a declared capability +// forwards the attachment while the conservative default still drops it. +func TestDMRConvertMessagesRespectsDeclaredCaps(t *testing.T) { + t.Parallel() + + t.Run("image dropped by default (no caps declared)", func(t *testing.T) { + t.Parallel() + c := &Client{} // zero-value caps == text-only + msgs := c.convertMessages(t.Context(), docMessage("photo.png", "image/png", minPNG)) + assert.Equal(t, 0, countImageParts(msgs), "image must be dropped when no capability is declared") + }) + + t.Run("image forwarded when supports_images declared", func(t *testing.T) { + t.Parallel() + c := &Client{attachmentCaps: modelinfo.CapsWith(true, false)} + msgs := c.convertMessages(t.Context(), docMessage("photo.png", "image/png", minPNG)) + assert.Equal(t, 1, countImageParts(msgs), "image must be forwarded when supports_images is declared") + }) + + t.Run("pdf dropped by default (no caps declared)", func(t *testing.T) { + t.Parallel() + c := &Client{} + msgs := c.convertMessages(t.Context(), docMessage("spec.pdf", "application/pdf", []byte("%PDF-1.4"))) + assert.Equal(t, 0, countFileParts(msgs), "pdf must be dropped when no capability is declared") + }) + + t.Run("pdf forwarded as file part when supports_pdf declared", func(t *testing.T) { + t.Parallel() + c := &Client{attachmentCaps: modelinfo.CapsWith(false, true)} + msgs := c.convertMessages(t.Context(), docMessage("spec.pdf", "application/pdf", []byte("%PDF-1.4"))) + assert.Equal(t, 1, countFileParts(msgs), "pdf must be forwarded as a file part when supports_pdf is declared") + }) +} + +// TestParseDMRProviderOptsAttachmentCaps verifies that supports_images / +// supports_pdf are parsed (accepting bool and string forms) and that invalid +// values are rejected. +func TestParseDMRProviderOptsAttachmentCaps(t *testing.T) { + t.Parallel() + + t.Run("unset defaults to text-only", func(t *testing.T) { + t.Parallel() + res, err := parseDMRProviderOpts("llama.cpp", &latest.ModelConfig{}) + require.NoError(t, err) + assert.False(t, res.supportsImages) + assert.False(t, res.supportsPDF) + }) + + t.Run("bool values", func(t *testing.T) { + t.Parallel() + res, err := parseDMRProviderOpts("llama.cpp", &latest.ModelConfig{ + ProviderOpts: map[string]any{"supports_images": true, "supports_pdf": false}, + }) + require.NoError(t, err) + assert.True(t, res.supportsImages) + assert.False(t, res.supportsPDF) + }) + + t.Run("string values parse", func(t *testing.T) { + t.Parallel() + res, err := parseDMRProviderOpts("llama.cpp", &latest.ModelConfig{ + ProviderOpts: map[string]any{"supports_images": "true", "supports_pdf": "1"}, + }) + require.NoError(t, err) + assert.True(t, res.supportsImages) + assert.True(t, res.supportsPDF) + }) + + t.Run("invalid string rejected", func(t *testing.T) { + t.Parallel() + _, err := parseDMRProviderOpts("llama.cpp", &latest.ModelConfig{ + ProviderOpts: map[string]any{"supports_images": "yes-please"}, + }) + require.Error(t, err) + assert.Contains(t, err.Error(), "supports_images") + }) + + t.Run("invalid type rejected", func(t *testing.T) { + t.Parallel() + _, err := parseDMRProviderOpts("llama.cpp", &latest.ModelConfig{ + ProviderOpts: map[string]any{"supports_pdf": 3}, + }) + require.Error(t, err) + assert.Contains(t, err.Error(), "supports_pdf") + }) +} + +// TestDMRVisionAttachmentForwardedEndToEnd exercises the full client path: +// provider_opts -> attachmentCaps -> request body. The serialized chat +// completion request must carry the image as an image_url content part. +func TestDMRVisionAttachmentForwardedEndToEnd(t *testing.T) { + t.Parallel() + + var captured []byte + server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + if strings.HasSuffix(r.URL.Path, "/chat/completions") { + body, _ := io.ReadAll(r.Body) + captured = body + w.Header().Set("Content-Type", "text/event-stream") + w.WriteHeader(http.StatusOK) + _, _ = w.Write([]byte("data: {\"choices\":[{\"index\":0,\"delta\":{\"content\":\"ok\"}}]}\n\n")) + _, _ = w.Write([]byte("data: [DONE]\n\n")) + return + } + w.WriteHeader(http.StatusOK) + _, _ = w.Write([]byte(`{"data":[]}`)) + })) + defer server.Close() + + cfg := &latest.ModelConfig{ + Provider: "dmr", + Model: "ai/qwen2.5-vl", + BaseURL: server.URL + "/engines/v1/", + ProviderOpts: map[string]any{"supports_images": true}, + } + client, err := NewClient(t.Context(), cfg) + require.NoError(t, err) + + stream, err := client.CreateChatCompletionStream(t.Context(), docMessage("photo.png", "image/png", minPNG), nil) + require.NoError(t, err) + for { + if _, err := stream.Recv(); err != nil { + break + } + } + stream.Close() + + require.NotEmpty(t, captured, "chat/completions should have been called") + + var req struct { + Messages []struct { + Role string `json:"role"` + Content []struct { + Type string `json:"type"` + } `json:"content"` + } `json:"messages"` + } + require.NoError(t, json.Unmarshal(captured, &req)) + + imageParts := 0 + for _, m := range req.Messages { + for _, p := range m.Content { + if p.Type == "image_url" { + imageParts++ + } + } + } + assert.Equal(t, 1, imageParts, "request body must carry the image as an image_url content part") +} diff --git a/pkg/model/provider/dmr/client.go b/pkg/model/provider/dmr/client.go index 78af92622..992543de3 100644 --- a/pkg/model/provider/dmr/client.go +++ b/pkg/model/provider/dmr/client.go @@ -21,6 +21,7 @@ import ( "github.com/docker/docker-agent/pkg/model/provider/base" "github.com/docker/docker-agent/pkg/model/provider/oaistream" "github.com/docker/docker-agent/pkg/model/provider/options" + "github.com/docker/docker-agent/pkg/modelinfo" "github.com/docker/docker-agent/pkg/tools" ) @@ -55,6 +56,13 @@ type Client struct { client openai.Client httpClient *http.Client engine string + + // attachmentCaps records the document MIME types this DMR-hosted model is + // declared to accept natively, parsed from provider_opts.supports_images / + // supports_pdf. models.dev has no "dmr" provider, so capabilities cannot be + // detected there and must be declared explicitly; the zero value is + // text-only, matching the previous conservative behavior. + attachmentCaps modelinfo.ModelCapabilities } // NewClient creates a new DMR client from the provided configuration @@ -138,16 +146,21 @@ func NewClient(ctx context.Context, cfg *latest.ModelConfig, opts ...options.Opt ModelOptions: globalOptions, BaseURL: baseURL, }, - client: openai.NewClient(clientOptions...), - httpClient: httpClient, - engine: engine, + client: openai.NewClient(clientOptions...), + httpClient: httpClient, + engine: engine, + attachmentCaps: modelinfo.CapsWith(parsed.supportsImages, parsed.supportsPDF), }, nil } // convertMessages converts chat messages to OpenAI format and merges consecutive // system/user messages, which is needed by some local models run by DMR. +// +// Attachment capabilities are injected explicitly from provider_opts rather than +// resolved from models.dev: DMR is not a models.dev provider, so a store lookup +// would always miss and silently drop image/PDF attachments. func (c *Client) convertMessages(ctx context.Context, messages []chat.Message) []openai.ChatCompletionMessageParamUnion { - openaiMessages := oaistream.ConvertMessages(ctx, messages, c.ID(), c.ModelOptions.ModelsDevStore()) + openaiMessages := oaistream.ConvertMessagesWithCaps(ctx, messages, c.attachmentCaps) return oaistream.MergeConsecutiveMessages(openaiMessages) } diff --git a/pkg/model/provider/dmr/configure.go b/pkg/model/provider/dmr/configure.go index 0e7e03a84..8e37b45fb 100644 --- a/pkg/model/provider/dmr/configure.go +++ b/pkg/model/provider/dmr/configure.go @@ -211,6 +211,32 @@ func parseInt(v any) (int, bool) { return 0, false } +// parseBoolOpt extracts a boolean provider_opts value. It accepts a native +// bool or a string parseable by strconv.ParseBool ("true", "false", "1", "0", +// ...) and returns false when the key is absent. An unparseable or wrong-typed +// value is reported as an error so the caller can fail fast. +func parseBoolOpt(opts map[string]any, key string) (bool, error) { + if len(opts) == 0 { + return false, nil + } + v, ok := opts[key] + if !ok { + return false, nil + } + switch t := v.(type) { + case bool: + return t, nil + case string: + b, perr := strconv.ParseBool(strings.TrimSpace(t)) + if perr != nil { + return false, fmt.Errorf("provider_opts: %q must be a boolean, got %q", key, t) + } + return b, nil + default: + return false, fmt.Errorf("provider_opts: %q must be a boolean, got %T", key, v) + } +} + // parseInt64Value parses an int64 from YAML/JSON-decoded values (int, float64, string). func parseInt64Value(v any) (int64, bool) { switch t := v.(type) { @@ -354,6 +380,8 @@ type dmrParseResult struct { vllm *vllmConfig keepAlive *string mode *string + supportsImages bool + supportsPDF bool } // parseDMRProviderOpts extracts DMR-specific provider options from the model @@ -404,6 +432,18 @@ func parseDMRProviderOpts(engine string, cfg *latest.ModelConfig) (dmrParseResul res.rawRuntimeFlags = raw } + supportsImages, err := parseBoolOpt(cfg.ProviderOpts, "supports_images") + if err != nil { + return res, err + } + res.supportsImages = supportsImages + + supportsPDF, err := parseBoolOpt(cfg.ProviderOpts, "supports_pdf") + if err != nil { + return res, err + } + res.supportsPDF = supportsPDF + slog.Debug("DMR provider opts", "provider_opts", cfg.ProviderOpts, "engine", engine) if len(cfg.ProviderOpts) == 0 { diff --git a/pkg/model/provider/oaistream/attachments.go b/pkg/model/provider/oaistream/attachments.go index 20f9ff8ab..9bff3be0f 100644 --- a/pkg/model/provider/oaistream/attachments.go +++ b/pkg/model/provider/oaistream/attachments.go @@ -12,24 +12,17 @@ import ( "github.com/docker/docker-agent/pkg/attachment" "github.com/docker/docker-agent/pkg/chat" "github.com/docker/docker-agent/pkg/modelinfo" - "github.com/docker/docker-agent/pkg/modelsdev" ) -// convertDocument converts a chat.Document to zero or more +// convertDocumentWithCaps converts a chat.Document to zero or more // ChatCompletionContentPartUnionParam values using the OpenAI Chat Completions -// format. It uses the provided modelsdev.Store for capability lookups. +// format, given the model's resolved attachment capabilities. // // Routing: // - image/* with InlineData → data-URI image part // - other binary MIMEs with InlineData → drop (no native document block on Chat Completions) // - text MIMEs with InlineText → text part with TXTEnvelope // - unsupported / no content → nil (logged as warning) -func convertDocument(ctx context.Context, doc chat.Document, id modelsdev.ID, store *modelsdev.Store) ([]openai.ChatCompletionContentPartUnionParam, error) { - mc := modelinfo.LoadCaps(ctx, store, id) - return convertDocumentWithCaps(ctx, doc, mc) -} - -// convertDocumentWithCaps is the caps-injectable variant used by tests. func convertDocumentWithCaps(ctx context.Context, doc chat.Document, mc modelinfo.ModelCapabilities) ([]openai.ChatCompletionContentPartUnionParam, error) { strategy, reason := attachment.Decide(doc, mc) @@ -39,25 +32,33 @@ func convertDocumentWithCaps(ctx context.Context, doc chat.Document, mc modelinf return nil, nil case attachment.StrategyB64: - mime := strings.ToLower(doc.MimeType) - if strings.HasPrefix(mime, "image/") { - // Build an OpenAI image part with a data URI. - dataURI := fmt.Sprintf("data:%s;base64,%s", - doc.MimeType, - base64.StdEncoding.EncodeToString(doc.Source.InlineData)) + // The data URI is only built in the branches that use it, so an + // unsupported binary MIME is dropped without paying for base64 encoding. + switch mime := strings.ToLower(doc.MimeType); { + case strings.HasPrefix(mime, "image/"): return []openai.ChatCompletionContentPartUnionParam{ openai.ImageContentPart(openai.ChatCompletionContentPartImageImageURLParam{ - URL: dataURI, + URL: dataURI(doc.MimeType, doc.Source.InlineData), + }), + }, nil + case mime == "application/pdf": + // Chat Completions accepts PDFs as a `file` content part carrying a + // base64 data URI. Decide only routes a PDF here when the model + // declares PDF support, so models that cannot read PDFs are dropped + // upstream and never reach this branch. + return []openai.ChatCompletionContentPartUnionParam{ + openai.FileContentPart(openai.ChatCompletionContentPartFileFileParam{ + FileData: openai.String(dataURI(doc.MimeType, doc.Source.InlineData)), + Filename: openai.String(doc.Name), }), }, nil + default: + // Any other binary MIME with inline data has no native Chat + // Completions representation; drop and warn. + slog.WarnContext(ctx, "oaistream: unsupported binary attachment for Chat Completions endpoint — dropping", + "mime", doc.MimeType, "doc", doc.Name) + return nil, nil } - // application/pdf and other binary MIMEs: the Chat Completions endpoint has - // no native document block. Sending raw PDF bytes as base64-in-TXT-envelope - // is wasteful and meaningless. Drop and warn so the caller can handle it. - slog.WarnContext(ctx, "oaistream: PDF attachments are not supported on this provider's "+ - "Chat Completions endpoint — dropping", - "mime", doc.MimeType, "doc", doc.Name) - return nil, nil case attachment.StrategyTXT: envelope := attachment.TXTEnvelope(doc.Name, doc.MimeType, doc.Source.InlineText) @@ -69,3 +70,8 @@ func convertDocumentWithCaps(ctx context.Context, doc chat.Document, mc modelinf return nil, fmt.Errorf("unknown attachment strategy %d", strategy) } } + +// dataURI builds an RFC 2397 base64 data URI for the given MIME type and bytes. +func dataURI(mimeType string, data []byte) string { + return fmt.Sprintf("data:%s;base64,%s", mimeType, base64.StdEncoding.EncodeToString(data)) +} diff --git a/pkg/model/provider/oaistream/attachments_test.go b/pkg/model/provider/oaistream/attachments_test.go index c6f8ab328..09ce6d57d 100644 --- a/pkg/model/provider/oaistream/attachments_test.go +++ b/pkg/model/provider/oaistream/attachments_test.go @@ -39,6 +39,97 @@ func TestConvertDocument_StrategyB64_Image(t *testing.T) { assert.Contains(t, parts[0].OfImageURL.ImageURL.URL, wantB64) } +// TestConvertDocument_StrategyB64_PDF verifies that a PDF document with +// InlineData and a PDF-capable model produces a `file` content part carrying a +// base64 data URI, rather than being dropped. +func TestConvertDocument_StrategyB64_PDF(t *testing.T) { + pdf := []byte("%PDF-1.4 minimal") + doc := chat.Document{ + Name: "spec.pdf", + MimeType: "application/pdf", + Source: chat.DocumentSource{InlineData: pdf}, + } + + pdfCaps := modelinfo.CapsWith(false, true) + parts, err := convertDocumentWithCaps(t.Context(), doc, pdfCaps) + require.NoError(t, err) + require.Len(t, parts, 1, "expected exactly one file part") + require.NotNil(t, parts[0].OfFile, "expected file part for PDF, got non-file") + assert.Nil(t, parts[0].OfImageURL, "PDF must not be an image part") + + wantB64 := base64.StdEncoding.EncodeToString(pdf) + assert.Equal(t, "spec.pdf", parts[0].OfFile.File.Filename.Value) + assert.Contains(t, parts[0].OfFile.File.FileData.Value, "data:application/pdf;base64,") + assert.Contains(t, parts[0].OfFile.File.FileData.Value, wantB64) +} + +// TestConvertDocument_StrategyB64_PDFDropped verifies that a PDF is dropped when +// the model does not declare PDF support. +func TestConvertDocument_StrategyB64_PDFDropped(t *testing.T) { + doc := chat.Document{ + Name: "spec.pdf", + MimeType: "application/pdf", + Source: chat.DocumentSource{InlineData: []byte("%PDF-1.4")}, + } + + parts, err := convertDocumentWithCaps(t.Context(), doc, modelinfo.CapsWith(true, false)) + require.NoError(t, err) + assert.Nil(t, parts, "pdf should be dropped when the model does not support PDF") +} + +// TestConvertMessagesWithCaps verifies the caps-injectable message path: an +// image attachment is forwarded when the injected caps allow it and dropped +// otherwise, independent of any models.dev store. +func TestConvertMessagesWithCaps(t *testing.T) { + messages := []chat.Message{{ + Role: chat.MessageRoleUser, + MultiContent: []chat.MessagePart{{ + Type: chat.MessagePartTypeDocument, + Document: &chat.Document{ + Name: "photo.jpg", + MimeType: "image/jpeg", + Source: chat.DocumentSource{InlineData: minJPEG}, + }, + }}, + }} + + withVision := ConvertMessagesWithCaps(t.Context(), messages, modelinfo.CapsWith(true, false)) + require.Len(t, withVision, 1) + require.NotNil(t, withVision[0].OfUser) + require.Len(t, withVision[0].OfUser.Content.OfArrayOfContentParts, 1) + assert.NotNil(t, withVision[0].OfUser.Content.OfArrayOfContentParts[0].OfImageURL) + + textOnly := ConvertMessagesWithCaps(t.Context(), messages, modelinfo.ModelCapabilities{}) + require.Len(t, textOnly, 1) + require.NotNil(t, textOnly[0].OfUser) + assert.Empty(t, textOnly[0].OfUser.Content.OfArrayOfContentParts, "image should be dropped for text-only caps") + + // Symmetric PDF case: a PDF is forwarded as a file part when PDF caps are + // injected, and dropped otherwise. + pdfMessages := []chat.Message{{ + Role: chat.MessageRoleUser, + MultiContent: []chat.MessagePart{{ + Type: chat.MessagePartTypeDocument, + Document: &chat.Document{ + Name: "spec.pdf", + MimeType: "application/pdf", + Source: chat.DocumentSource{InlineData: []byte("%PDF-1.4")}, + }, + }}, + }} + + withPDF := ConvertMessagesWithCaps(t.Context(), pdfMessages, modelinfo.CapsWith(false, true)) + require.Len(t, withPDF, 1) + require.NotNil(t, withPDF[0].OfUser) + require.Len(t, withPDF[0].OfUser.Content.OfArrayOfContentParts, 1) + assert.NotNil(t, withPDF[0].OfUser.Content.OfArrayOfContentParts[0].OfFile) + + pdfDropped := ConvertMessagesWithCaps(t.Context(), pdfMessages, modelinfo.ModelCapabilities{}) + require.Len(t, pdfDropped, 1) + require.NotNil(t, pdfDropped[0].OfUser) + assert.Empty(t, pdfDropped[0].OfUser.Content.OfArrayOfContentParts, "pdf should be dropped for text-only caps") +} + // TestConvertDocument_StrategyB64_ImageDropped verifies that an image is // dropped when the model does not support vision. func TestConvertDocument_StrategyB64_ImageDropped(t *testing.T) { diff --git a/pkg/model/provider/oaistream/messages.go b/pkg/model/provider/oaistream/messages.go index f3fddff8c..aa14dba54 100644 --- a/pkg/model/provider/oaistream/messages.go +++ b/pkg/model/provider/oaistream/messages.go @@ -14,6 +14,7 @@ import ( "github.com/openai/openai-go/v3/packages/param" "github.com/docker/docker-agent/pkg/chat" + "github.com/docker/docker-agent/pkg/modelinfo" "github.com/docker/docker-agent/pkg/modelsdev" ) @@ -27,18 +28,26 @@ func (j JSONSchema) MarshalJSON() ([]byte, error) { } // ConvertMultiContent converts chat.MessagePart slices to OpenAI content -// parts using the provided modelsdev.Store for capability lookups. +// parts, resolving attachment capabilities from the provided modelsdev.Store. func ConvertMultiContent(ctx context.Context, multiContent []chat.MessagePart, id modelsdev.ID, store *modelsdev.Store) []openai.ChatCompletionContentPartUnionParam { - return convertMultiContentWithStore(ctx, multiContent, id, store) + return convertMultiContentWithCaps(ctx, multiContent, modelinfo.LoadCaps(ctx, store, id)) } -// ConvertMessages converts chat.Message slices to OpenAI message params -// using the provided modelsdev.Store for capability lookups. +// ConvertMessages converts chat.Message slices to OpenAI message params, +// resolving attachment capabilities from the provided modelsdev.Store. func ConvertMessages(ctx context.Context, messages []chat.Message, id modelsdev.ID, store *modelsdev.Store) []openai.ChatCompletionMessageParamUnion { - return convertMessagesWithStore(ctx, messages, id, store) + return convertMessagesWithCaps(ctx, messages, modelinfo.LoadCaps(ctx, store, id)) } -func convertMultiContentWithStore(ctx context.Context, multiContent []chat.MessagePart, id modelsdev.ID, store *modelsdev.Store) []openai.ChatCompletionContentPartUnionParam { +// ConvertMessagesWithCaps is the caps-injectable variant of [ConvertMessages]. +// It is used by providers whose models are not in the models.dev catalog (e.g. +// Docker Model Runner), which must supply attachment capabilities explicitly +// rather than resolving them from a store. +func ConvertMessagesWithCaps(ctx context.Context, messages []chat.Message, mc modelinfo.ModelCapabilities) []openai.ChatCompletionMessageParamUnion { + return convertMessagesWithCaps(ctx, messages, mc) +} + +func convertMultiContentWithCaps(ctx context.Context, multiContent []chat.MessagePart, mc modelinfo.ModelCapabilities) []openai.ChatCompletionContentPartUnionParam { parts := make([]openai.ChatCompletionContentPartUnionParam, 0, len(multiContent)) for _, part := range multiContent { switch part.Type { @@ -54,7 +63,7 @@ func convertMultiContentWithStore(ctx context.Context, multiContent []chat.Messa } case chat.MessagePartTypeDocument: if part.Document != nil { - docParts, err := convertDocument(ctx, *part.Document, id, store) + docParts, err := convertDocumentWithCaps(ctx, *part.Document, mc) if err != nil { slog.WarnContext(ctx, "failed to convert document attachment", "error", err, "doc", part.Document.Name) continue @@ -66,7 +75,7 @@ func convertMultiContentWithStore(ctx context.Context, multiContent []chat.Messa return parts } -func convertMessagesWithStore(ctx context.Context, messages []chat.Message, id modelsdev.ID, store *modelsdev.Store) []openai.ChatCompletionMessageParamUnion { +func convertMessagesWithCaps(ctx context.Context, messages []chat.Message, mc modelinfo.ModelCapabilities) []openai.ChatCompletionMessageParamUnion { openaiMessages := make([]openai.ChatCompletionMessageParamUnion, 0, len(messages)) for i := range messages { msg := &messages[i] @@ -99,7 +108,7 @@ func convertMessagesWithStore(ctx context.Context, messages []chat.Message, id m if len(msg.MultiContent) == 0 { openaiMessage = openai.UserMessage(msg.Content) } else { - openaiMessage = openai.UserMessage(convertMultiContentWithStore(ctx, msg.MultiContent, id, store)) + openaiMessage = openai.UserMessage(convertMultiContentWithCaps(ctx, msg.MultiContent, mc)) } case chat.MessageRoleAssistant: @@ -189,7 +198,7 @@ func convertMessagesWithStore(ctx context.Context, messages []chat.Message, id m } case chat.MessagePartTypeDocument: if part.Document != nil { - docParts, err := convertDocument(ctx, *part.Document, id, store) + docParts, err := convertDocumentWithCaps(ctx, *part.Document, mc) if err != nil { slog.WarnContext(ctx, "failed to convert tool result document attachment", "error", err, "doc", part.Document.Name) continue