diff --git a/agent-schema.json b/agent-schema.json index 86fbaa22a..932677fdd 100644 --- a/agent-schema.json +++ b/agent-schema.json @@ -1422,7 +1422,7 @@ }, "provider_opts": { "type": "object", - "description": "Provider-specific options. Sampling parameters: top_k (integer, supported by anthropic, google, amazon-bedrock, and custom OpenAI-compatible providers like vLLM/Ollama), repetition_penalty (float, forwarded to custom OpenAI-compatible providers), min_p (float, forwarded to custom providers), seed (integer, forwarded to OpenAI). Lifecycle: unload_api (string) overrides the unload endpoint inherited from the provider config (relative path resolved against base_url's scheme+host, or an absolute URL); used by the runtime's `unload` on_agent_switch builtin hook to release model resources between agent switches. Infrastructure options: http_headers (map of string to string, adds custom HTTP headers to every request; used for OpenAI-compatible providers like github-copilot which requires Copilot-Integration-Id). dmr: runtime_flags. anthropic/amazon-bedrock (Claude): interleaved_thinking (boolean, default true), thinking_display ('summarized', 'omitted', or 'display') controls whether thinking blocks are returned in responses when thinking is enabled. Claude Opus 4.7 hides thinking by default ('omitted'); set thinking_display: summarized (or thinking_display: display) to receive thinking blocks. anthropic: fallbacks (array of model ID strings, in priority order; enables the server-side-fallback beta so requests refused by safety classifiers, e.g. on Claude Fable 5, are retried with the listed models in a single round trip). openai: transport ('sse' or 'websocket') to choose between SSE and WebSocket streaming for the Responses API. openai/anthropic/google: rerank_prompt (string) to fully override the system prompt used for RAG reranking (advanced - prefer using results.reranking.criteria for domain-specific guidance). Google: google_search (boolean) enables Google Search grounding, google_maps (boolean) enables Google Maps grounding, code_execution (boolean) enables server-side code execution.", + "description": "Provider-specific options. Sampling parameters: top_k (integer, supported by anthropic, google, amazon-bedrock, and custom OpenAI-compatible providers like vLLM/Ollama), repetition_penalty (float, forwarded to custom OpenAI-compatible providers), min_p (float, forwarded to custom providers), seed (integer, forwarded to OpenAI). Lifecycle: unload_api (string) overrides the unload endpoint inherited from the provider config (relative path resolved against base_url's scheme+host, or an absolute URL); used by the runtime's `unload` on_agent_switch builtin hook to release model resources between agent switches. Infrastructure options: http_headers (map of string to string, adds custom HTTP headers to every request; used for OpenAI-compatible providers like github-copilot which requires Copilot-Integration-Id). dmr: runtime_flags; supports_images (boolean) and supports_pdf (boolean) declare which document attachment types the local model accepts. DMR-hosted models are not in the models.dev catalog, so attachment capabilities cannot be detected automatically and default to text-only; set supports_images: true for a vision model so image attachments are forwarded instead of silently dropped. anthropic/amazon-bedrock (Claude): interleaved_thinking (boolean, default true), thinking_display ('summarized', 'omitted', or 'display') controls whether thinking blocks are returned in responses when thinking is enabled. Claude Opus 4.7 hides thinking by default ('omitted'); set thinking_display: summarized (or thinking_display: display) to receive thinking blocks. anthropic: fallbacks (array of model ID strings, in priority order; enables the server-side-fallback beta so requests refused by safety classifiers, e.g. on Claude Fable 5, are retried with the listed models in a single round trip). openai: transport ('sse' or 'websocket') to choose between SSE and WebSocket streaming for the Responses API. openai/anthropic/google: rerank_prompt (string) to fully override the system prompt used for RAG reranking (advanced - prefer using results.reranking.criteria for domain-specific guidance). Google: google_search (boolean) enables Google Search grounding, google_maps (boolean) enables Google Maps grounding, code_execution (boolean) enables server-side code execution.", "additionalProperties": true }, "track_usage": { diff --git a/examples/dmr.yaml b/examples/dmr.yaml index a4d112085..31fcfe2f3 100644 --- a/examples/dmr.yaml +++ b/examples/dmr.yaml @@ -23,3 +23,14 @@ models: speculative_draft_model: ai/qwen3:0.6B-Q4_K_M speculative_num_tokens: 16 # (this is the llama.cpp default if omitted) speculative_acceptance_rate: 0.8 # (this is the llama.cpp default if omitted) + + # A vision-capable local model. DMR-hosted models are not in the models.dev + # catalog, so docker-agent cannot detect multimodal support automatically and + # defaults to text-only (image/PDF attachments are dropped). Declare the + # capabilities explicitly so attachments are forwarded to the model. + qwen_vision: + provider: dmr + model: ai/qwen2.5-vl + provider_opts: + supports_images: true + supports_pdf: false diff --git a/pkg/model/provider/dmr/attachments_test.go b/pkg/model/provider/dmr/attachments_test.go new file mode 100644 index 000000000..e08c2ab43 --- /dev/null +++ b/pkg/model/provider/dmr/attachments_test.go @@ -0,0 +1,217 @@ +package dmr + +import ( + "encoding/json" + "io" + "net/http" + "net/http/httptest" + "strings" + "testing" + + "github.com/openai/openai-go/v3" + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + + "github.com/docker/docker-agent/pkg/chat" + "github.com/docker/docker-agent/pkg/config/latest" + "github.com/docker/docker-agent/pkg/modelinfo" +) + +// minPNG is a minimal PNG magic-byte header for use in tests. +var minPNG = []byte{0x89, 0x50, 0x4E, 0x47, 0x0D, 0x0A, 0x1A, 0x0A} + +// countParts counts content parts across all converted user messages for which +// pred returns true. +func countParts(msgs []openai.ChatCompletionMessageParamUnion, pred func(openai.ChatCompletionContentPartUnionParam) bool) int { + n := 0 + for _, m := range msgs { + if m.OfUser == nil { + continue + } + for _, p := range m.OfUser.Content.OfArrayOfContentParts { + if pred(p) { + n++ + } + } + } + return n +} + +func countImageParts(msgs []openai.ChatCompletionMessageParamUnion) int { + return countParts(msgs, func(p openai.ChatCompletionContentPartUnionParam) bool { return p.OfImageURL != nil }) +} + +func countFileParts(msgs []openai.ChatCompletionMessageParamUnion) int { + return countParts(msgs, func(p openai.ChatCompletionContentPartUnionParam) bool { return p.OfFile != nil }) +} + +// docMessage returns a single user message carrying one inline document attachment. +func docMessage(name, mime string, data []byte) []chat.Message { + return []chat.Message{{ + Role: chat.MessageRoleUser, + MultiContent: []chat.MessagePart{ + {Type: chat.MessagePartTypeText, Text: "Describe the attachment."}, + { + Type: chat.MessagePartTypeDocument, + Document: &chat.Document{ + Name: name, + MimeType: mime, + Source: chat.DocumentSource{InlineData: data}, + }, + }, + }, + }} +} + +// TestDMRConvertMessagesRespectsDeclaredCaps is the regression test for issue +// #2739: DMR-hosted models are absent from models.dev, so a store lookup always +// missed and image/PDF attachments were silently dropped. Capabilities are now +// declared via provider_opts and injected explicitly, so a declared capability +// forwards the attachment while the conservative default still drops it. +func TestDMRConvertMessagesRespectsDeclaredCaps(t *testing.T) { + t.Parallel() + + t.Run("image dropped by default (no caps declared)", func(t *testing.T) { + t.Parallel() + c := &Client{} // zero-value caps == text-only + msgs := c.convertMessages(t.Context(), docMessage("photo.png", "image/png", minPNG)) + assert.Equal(t, 0, countImageParts(msgs), "image must be dropped when no capability is declared") + }) + + t.Run("image forwarded when supports_images declared", func(t *testing.T) { + t.Parallel() + c := &Client{attachmentCaps: modelinfo.CapsWith(true, false)} + msgs := c.convertMessages(t.Context(), docMessage("photo.png", "image/png", minPNG)) + assert.Equal(t, 1, countImageParts(msgs), "image must be forwarded when supports_images is declared") + }) + + t.Run("pdf dropped by default (no caps declared)", func(t *testing.T) { + t.Parallel() + c := &Client{} + msgs := c.convertMessages(t.Context(), docMessage("spec.pdf", "application/pdf", []byte("%PDF-1.4"))) + assert.Equal(t, 0, countFileParts(msgs), "pdf must be dropped when no capability is declared") + }) + + t.Run("pdf forwarded as file part when supports_pdf declared", func(t *testing.T) { + t.Parallel() + c := &Client{attachmentCaps: modelinfo.CapsWith(false, true)} + msgs := c.convertMessages(t.Context(), docMessage("spec.pdf", "application/pdf", []byte("%PDF-1.4"))) + assert.Equal(t, 1, countFileParts(msgs), "pdf must be forwarded as a file part when supports_pdf is declared") + }) +} + +// TestParseDMRProviderOptsAttachmentCaps verifies that supports_images / +// supports_pdf are parsed (accepting bool and string forms) and that invalid +// values are rejected. +func TestParseDMRProviderOptsAttachmentCaps(t *testing.T) { + t.Parallel() + + t.Run("unset defaults to text-only", func(t *testing.T) { + t.Parallel() + res, err := parseDMRProviderOpts("llama.cpp", &latest.ModelConfig{}) + require.NoError(t, err) + assert.False(t, res.supportsImages) + assert.False(t, res.supportsPDF) + }) + + t.Run("bool values", func(t *testing.T) { + t.Parallel() + res, err := parseDMRProviderOpts("llama.cpp", &latest.ModelConfig{ + ProviderOpts: map[string]any{"supports_images": true, "supports_pdf": false}, + }) + require.NoError(t, err) + assert.True(t, res.supportsImages) + assert.False(t, res.supportsPDF) + }) + + t.Run("string values parse", func(t *testing.T) { + t.Parallel() + res, err := parseDMRProviderOpts("llama.cpp", &latest.ModelConfig{ + ProviderOpts: map[string]any{"supports_images": "true", "supports_pdf": "1"}, + }) + require.NoError(t, err) + assert.True(t, res.supportsImages) + assert.True(t, res.supportsPDF) + }) + + t.Run("invalid string rejected", func(t *testing.T) { + t.Parallel() + _, err := parseDMRProviderOpts("llama.cpp", &latest.ModelConfig{ + ProviderOpts: map[string]any{"supports_images": "yes-please"}, + }) + require.Error(t, err) + assert.Contains(t, err.Error(), "supports_images") + }) + + t.Run("invalid type rejected", func(t *testing.T) { + t.Parallel() + _, err := parseDMRProviderOpts("llama.cpp", &latest.ModelConfig{ + ProviderOpts: map[string]any{"supports_pdf": 3}, + }) + require.Error(t, err) + assert.Contains(t, err.Error(), "supports_pdf") + }) +} + +// TestDMRVisionAttachmentForwardedEndToEnd exercises the full client path: +// provider_opts -> attachmentCaps -> request body. The serialized chat +// completion request must carry the image as an image_url content part. +func TestDMRVisionAttachmentForwardedEndToEnd(t *testing.T) { + t.Parallel() + + var captured []byte + server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + if strings.HasSuffix(r.URL.Path, "/chat/completions") { + body, _ := io.ReadAll(r.Body) + captured = body + w.Header().Set("Content-Type", "text/event-stream") + w.WriteHeader(http.StatusOK) + _, _ = w.Write([]byte("data: {\"choices\":[{\"index\":0,\"delta\":{\"content\":\"ok\"}}]}\n\n")) + _, _ = w.Write([]byte("data: [DONE]\n\n")) + return + } + w.WriteHeader(http.StatusOK) + _, _ = w.Write([]byte(`{"data":[]}`)) + })) + defer server.Close() + + cfg := &latest.ModelConfig{ + Provider: "dmr", + Model: "ai/qwen2.5-vl", + BaseURL: server.URL + "/engines/v1/", + ProviderOpts: map[string]any{"supports_images": true}, + } + client, err := NewClient(t.Context(), cfg) + require.NoError(t, err) + + stream, err := client.CreateChatCompletionStream(t.Context(), docMessage("photo.png", "image/png", minPNG), nil) + require.NoError(t, err) + for { + if _, err := stream.Recv(); err != nil { + break + } + } + stream.Close() + + require.NotEmpty(t, captured, "chat/completions should have been called") + + var req struct { + Messages []struct { + Role string `json:"role"` + Content []struct { + Type string `json:"type"` + } `json:"content"` + } `json:"messages"` + } + require.NoError(t, json.Unmarshal(captured, &req)) + + imageParts := 0 + for _, m := range req.Messages { + for _, p := range m.Content { + if p.Type == "image_url" { + imageParts++ + } + } + } + assert.Equal(t, 1, imageParts, "request body must carry the image as an image_url content part") +} diff --git a/pkg/model/provider/dmr/client.go b/pkg/model/provider/dmr/client.go index 78af92622..992543de3 100644 --- a/pkg/model/provider/dmr/client.go +++ b/pkg/model/provider/dmr/client.go @@ -21,6 +21,7 @@ import ( "github.com/docker/docker-agent/pkg/model/provider/base" "github.com/docker/docker-agent/pkg/model/provider/oaistream" "github.com/docker/docker-agent/pkg/model/provider/options" + "github.com/docker/docker-agent/pkg/modelinfo" "github.com/docker/docker-agent/pkg/tools" ) @@ -55,6 +56,13 @@ type Client struct { client openai.Client httpClient *http.Client engine string + + // attachmentCaps records the document MIME types this DMR-hosted model is + // declared to accept natively, parsed from provider_opts.supports_images / + // supports_pdf. models.dev has no "dmr" provider, so capabilities cannot be + // detected there and must be declared explicitly; the zero value is + // text-only, matching the previous conservative behavior. + attachmentCaps modelinfo.ModelCapabilities } // NewClient creates a new DMR client from the provided configuration @@ -138,16 +146,21 @@ func NewClient(ctx context.Context, cfg *latest.ModelConfig, opts ...options.Opt ModelOptions: globalOptions, BaseURL: baseURL, }, - client: openai.NewClient(clientOptions...), - httpClient: httpClient, - engine: engine, + client: openai.NewClient(clientOptions...), + httpClient: httpClient, + engine: engine, + attachmentCaps: modelinfo.CapsWith(parsed.supportsImages, parsed.supportsPDF), }, nil } // convertMessages converts chat messages to OpenAI format and merges consecutive // system/user messages, which is needed by some local models run by DMR. +// +// Attachment capabilities are injected explicitly from provider_opts rather than +// resolved from models.dev: DMR is not a models.dev provider, so a store lookup +// would always miss and silently drop image/PDF attachments. func (c *Client) convertMessages(ctx context.Context, messages []chat.Message) []openai.ChatCompletionMessageParamUnion { - openaiMessages := oaistream.ConvertMessages(ctx, messages, c.ID(), c.ModelOptions.ModelsDevStore()) + openaiMessages := oaistream.ConvertMessagesWithCaps(ctx, messages, c.attachmentCaps) return oaistream.MergeConsecutiveMessages(openaiMessages) } diff --git a/pkg/model/provider/dmr/configure.go b/pkg/model/provider/dmr/configure.go index 0e7e03a84..8e37b45fb 100644 --- a/pkg/model/provider/dmr/configure.go +++ b/pkg/model/provider/dmr/configure.go @@ -211,6 +211,32 @@ func parseInt(v any) (int, bool) { return 0, false } +// parseBoolOpt extracts a boolean provider_opts value. It accepts a native +// bool or a string parseable by strconv.ParseBool ("true", "false", "1", "0", +// ...) and returns false when the key is absent. An unparseable or wrong-typed +// value is reported as an error so the caller can fail fast. +func parseBoolOpt(opts map[string]any, key string) (bool, error) { + if len(opts) == 0 { + return false, nil + } + v, ok := opts[key] + if !ok { + return false, nil + } + switch t := v.(type) { + case bool: + return t, nil + case string: + b, perr := strconv.ParseBool(strings.TrimSpace(t)) + if perr != nil { + return false, fmt.Errorf("provider_opts: %q must be a boolean, got %q", key, t) + } + return b, nil + default: + return false, fmt.Errorf("provider_opts: %q must be a boolean, got %T", key, v) + } +} + // parseInt64Value parses an int64 from YAML/JSON-decoded values (int, float64, string). func parseInt64Value(v any) (int64, bool) { switch t := v.(type) { @@ -354,6 +380,8 @@ type dmrParseResult struct { vllm *vllmConfig keepAlive *string mode *string + supportsImages bool + supportsPDF bool } // parseDMRProviderOpts extracts DMR-specific provider options from the model @@ -404,6 +432,18 @@ func parseDMRProviderOpts(engine string, cfg *latest.ModelConfig) (dmrParseResul res.rawRuntimeFlags = raw } + supportsImages, err := parseBoolOpt(cfg.ProviderOpts, "supports_images") + if err != nil { + return res, err + } + res.supportsImages = supportsImages + + supportsPDF, err := parseBoolOpt(cfg.ProviderOpts, "supports_pdf") + if err != nil { + return res, err + } + res.supportsPDF = supportsPDF + slog.Debug("DMR provider opts", "provider_opts", cfg.ProviderOpts, "engine", engine) if len(cfg.ProviderOpts) == 0 { diff --git a/pkg/model/provider/oaistream/attachments.go b/pkg/model/provider/oaistream/attachments.go index 20f9ff8ab..9bff3be0f 100644 --- a/pkg/model/provider/oaistream/attachments.go +++ b/pkg/model/provider/oaistream/attachments.go @@ -12,24 +12,17 @@ import ( "github.com/docker/docker-agent/pkg/attachment" "github.com/docker/docker-agent/pkg/chat" "github.com/docker/docker-agent/pkg/modelinfo" - "github.com/docker/docker-agent/pkg/modelsdev" ) -// convertDocument converts a chat.Document to zero or more +// convertDocumentWithCaps converts a chat.Document to zero or more // ChatCompletionContentPartUnionParam values using the OpenAI Chat Completions -// format. It uses the provided modelsdev.Store for capability lookups. +// format, given the model's resolved attachment capabilities. // // Routing: // - image/* with InlineData → data-URI image part // - other binary MIMEs with InlineData → drop (no native document block on Chat Completions) // - text MIMEs with InlineText → text part with TXTEnvelope // - unsupported / no content → nil (logged as warning) -func convertDocument(ctx context.Context, doc chat.Document, id modelsdev.ID, store *modelsdev.Store) ([]openai.ChatCompletionContentPartUnionParam, error) { - mc := modelinfo.LoadCaps(ctx, store, id) - return convertDocumentWithCaps(ctx, doc, mc) -} - -// convertDocumentWithCaps is the caps-injectable variant used by tests. func convertDocumentWithCaps(ctx context.Context, doc chat.Document, mc modelinfo.ModelCapabilities) ([]openai.ChatCompletionContentPartUnionParam, error) { strategy, reason := attachment.Decide(doc, mc) @@ -39,25 +32,33 @@ func convertDocumentWithCaps(ctx context.Context, doc chat.Document, mc modelinf return nil, nil case attachment.StrategyB64: - mime := strings.ToLower(doc.MimeType) - if strings.HasPrefix(mime, "image/") { - // Build an OpenAI image part with a data URI. - dataURI := fmt.Sprintf("data:%s;base64,%s", - doc.MimeType, - base64.StdEncoding.EncodeToString(doc.Source.InlineData)) + // The data URI is only built in the branches that use it, so an + // unsupported binary MIME is dropped without paying for base64 encoding. + switch mime := strings.ToLower(doc.MimeType); { + case strings.HasPrefix(mime, "image/"): return []openai.ChatCompletionContentPartUnionParam{ openai.ImageContentPart(openai.ChatCompletionContentPartImageImageURLParam{ - URL: dataURI, + URL: dataURI(doc.MimeType, doc.Source.InlineData), + }), + }, nil + case mime == "application/pdf": + // Chat Completions accepts PDFs as a `file` content part carrying a + // base64 data URI. Decide only routes a PDF here when the model + // declares PDF support, so models that cannot read PDFs are dropped + // upstream and never reach this branch. + return []openai.ChatCompletionContentPartUnionParam{ + openai.FileContentPart(openai.ChatCompletionContentPartFileFileParam{ + FileData: openai.String(dataURI(doc.MimeType, doc.Source.InlineData)), + Filename: openai.String(doc.Name), }), }, nil + default: + // Any other binary MIME with inline data has no native Chat + // Completions representation; drop and warn. + slog.WarnContext(ctx, "oaistream: unsupported binary attachment for Chat Completions endpoint — dropping", + "mime", doc.MimeType, "doc", doc.Name) + return nil, nil } - // application/pdf and other binary MIMEs: the Chat Completions endpoint has - // no native document block. Sending raw PDF bytes as base64-in-TXT-envelope - // is wasteful and meaningless. Drop and warn so the caller can handle it. - slog.WarnContext(ctx, "oaistream: PDF attachments are not supported on this provider's "+ - "Chat Completions endpoint — dropping", - "mime", doc.MimeType, "doc", doc.Name) - return nil, nil case attachment.StrategyTXT: envelope := attachment.TXTEnvelope(doc.Name, doc.MimeType, doc.Source.InlineText) @@ -69,3 +70,8 @@ func convertDocumentWithCaps(ctx context.Context, doc chat.Document, mc modelinf return nil, fmt.Errorf("unknown attachment strategy %d", strategy) } } + +// dataURI builds an RFC 2397 base64 data URI for the given MIME type and bytes. +func dataURI(mimeType string, data []byte) string { + return fmt.Sprintf("data:%s;base64,%s", mimeType, base64.StdEncoding.EncodeToString(data)) +} diff --git a/pkg/model/provider/oaistream/attachments_test.go b/pkg/model/provider/oaistream/attachments_test.go index c6f8ab328..09ce6d57d 100644 --- a/pkg/model/provider/oaistream/attachments_test.go +++ b/pkg/model/provider/oaistream/attachments_test.go @@ -39,6 +39,97 @@ func TestConvertDocument_StrategyB64_Image(t *testing.T) { assert.Contains(t, parts[0].OfImageURL.ImageURL.URL, wantB64) } +// TestConvertDocument_StrategyB64_PDF verifies that a PDF document with +// InlineData and a PDF-capable model produces a `file` content part carrying a +// base64 data URI, rather than being dropped. +func TestConvertDocument_StrategyB64_PDF(t *testing.T) { + pdf := []byte("%PDF-1.4 minimal") + doc := chat.Document{ + Name: "spec.pdf", + MimeType: "application/pdf", + Source: chat.DocumentSource{InlineData: pdf}, + } + + pdfCaps := modelinfo.CapsWith(false, true) + parts, err := convertDocumentWithCaps(t.Context(), doc, pdfCaps) + require.NoError(t, err) + require.Len(t, parts, 1, "expected exactly one file part") + require.NotNil(t, parts[0].OfFile, "expected file part for PDF, got non-file") + assert.Nil(t, parts[0].OfImageURL, "PDF must not be an image part") + + wantB64 := base64.StdEncoding.EncodeToString(pdf) + assert.Equal(t, "spec.pdf", parts[0].OfFile.File.Filename.Value) + assert.Contains(t, parts[0].OfFile.File.FileData.Value, "data:application/pdf;base64,") + assert.Contains(t, parts[0].OfFile.File.FileData.Value, wantB64) +} + +// TestConvertDocument_StrategyB64_PDFDropped verifies that a PDF is dropped when +// the model does not declare PDF support. +func TestConvertDocument_StrategyB64_PDFDropped(t *testing.T) { + doc := chat.Document{ + Name: "spec.pdf", + MimeType: "application/pdf", + Source: chat.DocumentSource{InlineData: []byte("%PDF-1.4")}, + } + + parts, err := convertDocumentWithCaps(t.Context(), doc, modelinfo.CapsWith(true, false)) + require.NoError(t, err) + assert.Nil(t, parts, "pdf should be dropped when the model does not support PDF") +} + +// TestConvertMessagesWithCaps verifies the caps-injectable message path: an +// image attachment is forwarded when the injected caps allow it and dropped +// otherwise, independent of any models.dev store. +func TestConvertMessagesWithCaps(t *testing.T) { + messages := []chat.Message{{ + Role: chat.MessageRoleUser, + MultiContent: []chat.MessagePart{{ + Type: chat.MessagePartTypeDocument, + Document: &chat.Document{ + Name: "photo.jpg", + MimeType: "image/jpeg", + Source: chat.DocumentSource{InlineData: minJPEG}, + }, + }}, + }} + + withVision := ConvertMessagesWithCaps(t.Context(), messages, modelinfo.CapsWith(true, false)) + require.Len(t, withVision, 1) + require.NotNil(t, withVision[0].OfUser) + require.Len(t, withVision[0].OfUser.Content.OfArrayOfContentParts, 1) + assert.NotNil(t, withVision[0].OfUser.Content.OfArrayOfContentParts[0].OfImageURL) + + textOnly := ConvertMessagesWithCaps(t.Context(), messages, modelinfo.ModelCapabilities{}) + require.Len(t, textOnly, 1) + require.NotNil(t, textOnly[0].OfUser) + assert.Empty(t, textOnly[0].OfUser.Content.OfArrayOfContentParts, "image should be dropped for text-only caps") + + // Symmetric PDF case: a PDF is forwarded as a file part when PDF caps are + // injected, and dropped otherwise. + pdfMessages := []chat.Message{{ + Role: chat.MessageRoleUser, + MultiContent: []chat.MessagePart{{ + Type: chat.MessagePartTypeDocument, + Document: &chat.Document{ + Name: "spec.pdf", + MimeType: "application/pdf", + Source: chat.DocumentSource{InlineData: []byte("%PDF-1.4")}, + }, + }}, + }} + + withPDF := ConvertMessagesWithCaps(t.Context(), pdfMessages, modelinfo.CapsWith(false, true)) + require.Len(t, withPDF, 1) + require.NotNil(t, withPDF[0].OfUser) + require.Len(t, withPDF[0].OfUser.Content.OfArrayOfContentParts, 1) + assert.NotNil(t, withPDF[0].OfUser.Content.OfArrayOfContentParts[0].OfFile) + + pdfDropped := ConvertMessagesWithCaps(t.Context(), pdfMessages, modelinfo.ModelCapabilities{}) + require.Len(t, pdfDropped, 1) + require.NotNil(t, pdfDropped[0].OfUser) + assert.Empty(t, pdfDropped[0].OfUser.Content.OfArrayOfContentParts, "pdf should be dropped for text-only caps") +} + // TestConvertDocument_StrategyB64_ImageDropped verifies that an image is // dropped when the model does not support vision. func TestConvertDocument_StrategyB64_ImageDropped(t *testing.T) { diff --git a/pkg/model/provider/oaistream/messages.go b/pkg/model/provider/oaistream/messages.go index f3fddff8c..aa14dba54 100644 --- a/pkg/model/provider/oaistream/messages.go +++ b/pkg/model/provider/oaistream/messages.go @@ -14,6 +14,7 @@ import ( "github.com/openai/openai-go/v3/packages/param" "github.com/docker/docker-agent/pkg/chat" + "github.com/docker/docker-agent/pkg/modelinfo" "github.com/docker/docker-agent/pkg/modelsdev" ) @@ -27,18 +28,26 @@ func (j JSONSchema) MarshalJSON() ([]byte, error) { } // ConvertMultiContent converts chat.MessagePart slices to OpenAI content -// parts using the provided modelsdev.Store for capability lookups. +// parts, resolving attachment capabilities from the provided modelsdev.Store. func ConvertMultiContent(ctx context.Context, multiContent []chat.MessagePart, id modelsdev.ID, store *modelsdev.Store) []openai.ChatCompletionContentPartUnionParam { - return convertMultiContentWithStore(ctx, multiContent, id, store) + return convertMultiContentWithCaps(ctx, multiContent, modelinfo.LoadCaps(ctx, store, id)) } -// ConvertMessages converts chat.Message slices to OpenAI message params -// using the provided modelsdev.Store for capability lookups. +// ConvertMessages converts chat.Message slices to OpenAI message params, +// resolving attachment capabilities from the provided modelsdev.Store. func ConvertMessages(ctx context.Context, messages []chat.Message, id modelsdev.ID, store *modelsdev.Store) []openai.ChatCompletionMessageParamUnion { - return convertMessagesWithStore(ctx, messages, id, store) + return convertMessagesWithCaps(ctx, messages, modelinfo.LoadCaps(ctx, store, id)) } -func convertMultiContentWithStore(ctx context.Context, multiContent []chat.MessagePart, id modelsdev.ID, store *modelsdev.Store) []openai.ChatCompletionContentPartUnionParam { +// ConvertMessagesWithCaps is the caps-injectable variant of [ConvertMessages]. +// It is used by providers whose models are not in the models.dev catalog (e.g. +// Docker Model Runner), which must supply attachment capabilities explicitly +// rather than resolving them from a store. +func ConvertMessagesWithCaps(ctx context.Context, messages []chat.Message, mc modelinfo.ModelCapabilities) []openai.ChatCompletionMessageParamUnion { + return convertMessagesWithCaps(ctx, messages, mc) +} + +func convertMultiContentWithCaps(ctx context.Context, multiContent []chat.MessagePart, mc modelinfo.ModelCapabilities) []openai.ChatCompletionContentPartUnionParam { parts := make([]openai.ChatCompletionContentPartUnionParam, 0, len(multiContent)) for _, part := range multiContent { switch part.Type { @@ -54,7 +63,7 @@ func convertMultiContentWithStore(ctx context.Context, multiContent []chat.Messa } case chat.MessagePartTypeDocument: if part.Document != nil { - docParts, err := convertDocument(ctx, *part.Document, id, store) + docParts, err := convertDocumentWithCaps(ctx, *part.Document, mc) if err != nil { slog.WarnContext(ctx, "failed to convert document attachment", "error", err, "doc", part.Document.Name) continue @@ -66,7 +75,7 @@ func convertMultiContentWithStore(ctx context.Context, multiContent []chat.Messa return parts } -func convertMessagesWithStore(ctx context.Context, messages []chat.Message, id modelsdev.ID, store *modelsdev.Store) []openai.ChatCompletionMessageParamUnion { +func convertMessagesWithCaps(ctx context.Context, messages []chat.Message, mc modelinfo.ModelCapabilities) []openai.ChatCompletionMessageParamUnion { openaiMessages := make([]openai.ChatCompletionMessageParamUnion, 0, len(messages)) for i := range messages { msg := &messages[i] @@ -99,7 +108,7 @@ func convertMessagesWithStore(ctx context.Context, messages []chat.Message, id m if len(msg.MultiContent) == 0 { openaiMessage = openai.UserMessage(msg.Content) } else { - openaiMessage = openai.UserMessage(convertMultiContentWithStore(ctx, msg.MultiContent, id, store)) + openaiMessage = openai.UserMessage(convertMultiContentWithCaps(ctx, msg.MultiContent, mc)) } case chat.MessageRoleAssistant: @@ -189,7 +198,7 @@ func convertMessagesWithStore(ctx context.Context, messages []chat.Message, id m } case chat.MessagePartTypeDocument: if part.Document != nil { - docParts, err := convertDocument(ctx, *part.Document, id, store) + docParts, err := convertDocumentWithCaps(ctx, *part.Document, mc) if err != nil { slog.WarnContext(ctx, "failed to convert tool result document attachment", "error", err, "doc", part.Document.Name) continue