Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion agent-schema.json
Original file line number Diff line number Diff line change
Expand Up @@ -1422,7 +1422,7 @@
},
"provider_opts": {
"type": "object",
"description": "Provider-specific options. Sampling parameters: top_k (integer, supported by anthropic, google, amazon-bedrock, and custom OpenAI-compatible providers like vLLM/Ollama), repetition_penalty (float, forwarded to custom OpenAI-compatible providers), min_p (float, forwarded to custom providers), seed (integer, forwarded to OpenAI). Lifecycle: unload_api (string) overrides the unload endpoint inherited from the provider config (relative path resolved against base_url's scheme+host, or an absolute URL); used by the runtime's `unload` on_agent_switch builtin hook to release model resources between agent switches. Infrastructure options: http_headers (map of string to string, adds custom HTTP headers to every request; used for OpenAI-compatible providers like github-copilot which requires Copilot-Integration-Id). dmr: runtime_flags. anthropic/amazon-bedrock (Claude): interleaved_thinking (boolean, default true), thinking_display ('summarized', 'omitted', or 'display') controls whether thinking blocks are returned in responses when thinking is enabled. Claude Opus 4.7 hides thinking by default ('omitted'); set thinking_display: summarized (or thinking_display: display) to receive thinking blocks. anthropic: fallbacks (array of model ID strings, in priority order; enables the server-side-fallback beta so requests refused by safety classifiers, e.g. on Claude Fable 5, are retried with the listed models in a single round trip). openai: transport ('sse' or 'websocket') to choose between SSE and WebSocket streaming for the Responses API. openai/anthropic/google: rerank_prompt (string) to fully override the system prompt used for RAG reranking (advanced - prefer using results.reranking.criteria for domain-specific guidance). Google: google_search (boolean) enables Google Search grounding, google_maps (boolean) enables Google Maps grounding, code_execution (boolean) enables server-side code execution.",
"description": "Provider-specific options. Sampling parameters: top_k (integer, supported by anthropic, google, amazon-bedrock, and custom OpenAI-compatible providers like vLLM/Ollama), repetition_penalty (float, forwarded to custom OpenAI-compatible providers), min_p (float, forwarded to custom providers), seed (integer, forwarded to OpenAI). Lifecycle: unload_api (string) overrides the unload endpoint inherited from the provider config (relative path resolved against base_url's scheme+host, or an absolute URL); used by the runtime's `unload` on_agent_switch builtin hook to release model resources between agent switches. Infrastructure options: http_headers (map of string to string, adds custom HTTP headers to every request; used for OpenAI-compatible providers like github-copilot which requires Copilot-Integration-Id). dmr: runtime_flags; supports_images (boolean) and supports_pdf (boolean) declare which document attachment types the local model accepts. DMR-hosted models are not in the models.dev catalog, so attachment capabilities cannot be detected automatically and default to text-only; set supports_images: true for a vision model so image attachments are forwarded instead of silently dropped. anthropic/amazon-bedrock (Claude): interleaved_thinking (boolean, default true), thinking_display ('summarized', 'omitted', or 'display') controls whether thinking blocks are returned in responses when thinking is enabled. Claude Opus 4.7 hides thinking by default ('omitted'); set thinking_display: summarized (or thinking_display: display) to receive thinking blocks. anthropic: fallbacks (array of model ID strings, in priority order; enables the server-side-fallback beta so requests refused by safety classifiers, e.g. on Claude Fable 5, are retried with the listed models in a single round trip). openai: transport ('sse' or 'websocket') to choose between SSE and WebSocket streaming for the Responses API. 
openai/anthropic/google: rerank_prompt (string) to fully override the system prompt used for RAG reranking (advanced - prefer using results.reranking.criteria for domain-specific guidance). Google: google_search (boolean) enables Google Search grounding, google_maps (boolean) enables Google Maps grounding, code_execution (boolean) enables server-side code execution.",
"additionalProperties": true
},
"track_usage": {
Expand Down
11 changes: 11 additions & 0 deletions examples/dmr.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -23,3 +23,14 @@ models:
speculative_draft_model: ai/qwen3:0.6B-Q4_K_M
speculative_num_tokens: 16 # (this is the llama.cpp default if omitted)
speculative_acceptance_rate: 0.8 # (this is the llama.cpp default if omitted)

# A vision-capable local model. DMR-hosted models are not in the models.dev
# catalog, so docker-agent cannot detect multimodal support automatically and
# defaults to text-only (image/PDF attachments are dropped). Declare the
# capabilities explicitly so attachments are forwarded to the model.
qwen_vision:
provider: dmr
model: ai/qwen2.5-vl
provider_opts:
supports_images: true
supports_pdf: false
217 changes: 217 additions & 0 deletions pkg/model/provider/dmr/attachments_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,217 @@
package dmr

import (
"encoding/json"
"io"
"net/http"
"net/http/httptest"
"strings"
"testing"

"github.com/openai/openai-go/v3"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"

"github.com/docker/docker-agent/pkg/chat"
"github.com/docker/docker-agent/pkg/config/latest"
"github.com/docker/docker-agent/pkg/modelinfo"
)

// minPNG holds only the 8-byte PNG file signature (0x89 'P' 'N' 'G' CR LF
// 0x1A LF). It carries no image chunks; the tests use it purely as stand-in
// payload bytes for an "image/png" document attachment.
var minPNG = []byte{0x89, 0x50, 0x4E, 0x47, 0x0D, 0x0A, 0x1A, 0x0A}

// countParts walks every user message in msgs and returns how many of their
// array-form content parts satisfy pred. Messages whose OfUser variant is unset
// are ignored.
func countParts(msgs []openai.ChatCompletionMessageParamUnion, pred func(openai.ChatCompletionContentPartUnionParam) bool) int {
	var matched int
	for i := range msgs {
		user := msgs[i].OfUser
		if user == nil {
			continue
		}
		for _, part := range user.Content.OfArrayOfContentParts {
			if !pred(part) {
				continue
			}
			matched++
		}
	}
	return matched
}

// countImageParts returns the number of user content parts in msgs whose
// OfImageURL variant is set.
func countImageParts(msgs []openai.ChatCompletionMessageParamUnion) int {
	isImage := func(part openai.ChatCompletionContentPartUnionParam) bool {
		return part.OfImageURL != nil
	}
	return countParts(msgs, isImage)
}

// countFileParts returns the number of user content parts in msgs whose
// OfFile variant is set.
func countFileParts(msgs []openai.ChatCompletionMessageParamUnion) int {
	isFile := func(part openai.ChatCompletionContentPartUnionParam) bool {
		return part.OfFile != nil
	}
	return countParts(msgs, isFile)
}

// docMessage builds a one-element conversation: a user message made of a short
// text prompt followed by a single inline document attachment with the given
// file name, MIME type and raw bytes.
func docMessage(name, mime string, data []byte) []chat.Message {
	prompt := chat.MessagePart{
		Type: chat.MessagePartTypeText,
		Text: "Describe the attachment.",
	}
	attachment := chat.MessagePart{
		Type: chat.MessagePartTypeDocument,
		Document: &chat.Document{
			Name:     name,
			MimeType: mime,
			Source:   chat.DocumentSource{InlineData: data},
		},
	}
	userMsg := chat.Message{
		Role:         chat.MessageRoleUser,
		MultiContent: []chat.MessagePart{prompt, attachment},
	}
	return []chat.Message{userMsg}
}

// TestDMRConvertMessagesRespectsDeclaredCaps is the regression test for issue
// #2739: DMR-hosted models are absent from models.dev, so a store lookup always
// missed and image/PDF attachments were silently dropped. Capabilities are now
// declared via provider_opts and injected explicitly, so a declared capability
// forwards the attachment while the conservative default still drops it.
func TestDMRConvertMessagesRespectsDeclaredCaps(t *testing.T) {
t.Parallel()

t.Run("image dropped by default (no caps declared)", func(t *testing.T) {
t.Parallel()
c := &Client{} // zero-value caps == text-only
msgs := c.convertMessages(t.Context(), docMessage("photo.png", "image/png", minPNG))
assert.Equal(t, 0, countImageParts(msgs), "image must be dropped when no capability is declared")
})

t.Run("image forwarded when supports_images declared", func(t *testing.T) {
t.Parallel()
c := &Client{attachmentCaps: modelinfo.CapsWith(true, false)}
msgs := c.convertMessages(t.Context(), docMessage("photo.png", "image/png", minPNG))
assert.Equal(t, 1, countImageParts(msgs), "image must be forwarded when supports_images is declared")
})

t.Run("pdf dropped by default (no caps declared)", func(t *testing.T) {
t.Parallel()
c := &Client{}
msgs := c.convertMessages(t.Context(), docMessage("spec.pdf", "application/pdf", []byte("%PDF-1.4")))
assert.Equal(t, 0, countFileParts(msgs), "pdf must be dropped when no capability is declared")
})

t.Run("pdf forwarded as file part when supports_pdf declared", func(t *testing.T) {
t.Parallel()
c := &Client{attachmentCaps: modelinfo.CapsWith(false, true)}
msgs := c.convertMessages(t.Context(), docMessage("spec.pdf", "application/pdf", []byte("%PDF-1.4")))
assert.Equal(t, 1, countFileParts(msgs), "pdf must be forwarded as a file part when supports_pdf is declared")
})
}

// TestParseDMRProviderOptsAttachmentCaps checks how parseDMRProviderOpts reads
// supports_images / supports_pdf: absent keys leave both flags false, native
// bools and parseable strings are accepted, and unparseable strings or
// non-bool/non-string values produce an error naming the offending key.
func TestParseDMRProviderOptsAttachmentCaps(t *testing.T) {
	t.Parallel()

	accepted := []struct {
		name       string
		opts       map[string]any
		wantImages bool
		wantPDF    bool
	}{
		{
			name: "unset defaults to text-only",
		},
		{
			name:       "bool values",
			opts:       map[string]any{"supports_images": true, "supports_pdf": false},
			wantImages: true,
		},
		{
			name:       "string values parse",
			opts:       map[string]any{"supports_images": "true", "supports_pdf": "1"},
			wantImages: true,
			wantPDF:    true,
		},
	}
	for _, tc := range accepted {
		t.Run(tc.name, func(t *testing.T) {
			t.Parallel()
			got, err := parseDMRProviderOpts("llama.cpp", &latest.ModelConfig{ProviderOpts: tc.opts})
			require.NoError(t, err)
			assert.Equal(t, tc.wantImages, got.supportsImages)
			assert.Equal(t, tc.wantPDF, got.supportsPDF)
		})
	}

	rejected := []struct {
		name    string
		opts    map[string]any
		wantKey string // key that must be mentioned in the error text
	}{
		{
			name:    "invalid string rejected",
			opts:    map[string]any{"supports_images": "yes-please"},
			wantKey: "supports_images",
		},
		{
			name:    "invalid type rejected",
			opts:    map[string]any{"supports_pdf": 3},
			wantKey: "supports_pdf",
		},
	}
	for _, tc := range rejected {
		t.Run(tc.name, func(t *testing.T) {
			t.Parallel()
			_, err := parseDMRProviderOpts("llama.cpp", &latest.ModelConfig{ProviderOpts: tc.opts})
			require.Error(t, err)
			assert.Contains(t, err.Error(), tc.wantKey)
		})
	}
}

// TestDMRVisionAttachmentForwardedEndToEnd drives the whole client path, from
// provider_opts through attachmentCaps to the outgoing request body. A stub
// HTTP server records the chat completion request, which must contain the
// image as exactly one image_url content part.
func TestDMRVisionAttachmentForwardedEndToEnd(t *testing.T) {
	t.Parallel()

	// requestBody receives the raw body of the chat/completions call.
	var requestBody []byte
	handler := func(w http.ResponseWriter, r *http.Request) {
		if !strings.HasSuffix(r.URL.Path, "/chat/completions") {
			// Any other endpoint gets an empty listing.
			w.WriteHeader(http.StatusOK)
			_, _ = w.Write([]byte(`{"data":[]}`))
			return
		}

		// Best-effort read: an empty capture fails the NotEmpty check below.
		requestBody, _ = io.ReadAll(r.Body)

		// Reply with a minimal two-event SSE stream.
		w.Header().Set("Content-Type", "text/event-stream")
		w.WriteHeader(http.StatusOK)
		_, _ = w.Write([]byte("data: {\"choices\":[{\"index\":0,\"delta\":{\"content\":\"ok\"}}]}\n\n"))
		_, _ = w.Write([]byte("data: [DONE]\n\n"))
	}
	srv := httptest.NewServer(http.HandlerFunc(handler))
	defer srv.Close()

	modelCfg := &latest.ModelConfig{
		Provider:     "dmr",
		Model:        "ai/qwen2.5-vl",
		BaseURL:      srv.URL + "/engines/v1/",
		ProviderOpts: map[string]any{"supports_images": true},
	}
	dmrClient, err := NewClient(t.Context(), modelCfg)
	require.NoError(t, err)

	stream, err := dmrClient.CreateChatCompletionStream(t.Context(), docMessage("photo.png", "image/png", minPNG), nil)
	require.NoError(t, err)

	// Drain the stream until Recv reports any error; the chunks themselves
	// are irrelevant to this test.
	_, recvErr := stream.Recv()
	for recvErr == nil {
		_, recvErr = stream.Recv()
	}
	stream.Close()

	require.NotEmpty(t, requestBody, "chat/completions should have been called")

	// Decode only the fields needed to count content-part types.
	type contentPart struct {
		Type string `json:"type"`
	}
	type message struct {
		Role    string        `json:"role"`
		Content []contentPart `json:"content"`
	}
	var payload struct {
		Messages []message `json:"messages"`
	}
	require.NoError(t, json.Unmarshal(requestBody, &payload))

	var imageCount int
	for _, msg := range payload.Messages {
		for _, part := range msg.Content {
			if part.Type != "image_url" {
				continue
			}
			imageCount++
		}
	}
	assert.Equal(t, 1, imageCount, "request body must carry the image as an image_url content part")
}
21 changes: 17 additions & 4 deletions pkg/model/provider/dmr/client.go
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ import (
"github.com/docker/docker-agent/pkg/model/provider/base"
"github.com/docker/docker-agent/pkg/model/provider/oaistream"
"github.com/docker/docker-agent/pkg/model/provider/options"
"github.com/docker/docker-agent/pkg/modelinfo"
"github.com/docker/docker-agent/pkg/tools"
)

Expand Down Expand Up @@ -55,6 +56,13 @@ type Client struct {
client openai.Client
httpClient *http.Client
engine string

// attachmentCaps records the document MIME types this DMR-hosted model is
// declared to accept natively, parsed from provider_opts.supports_images /
// supports_pdf. models.dev has no "dmr" provider, so capabilities cannot be
// detected there and must be declared explicitly; the zero value is
// text-only, matching the previous conservative behavior.
attachmentCaps modelinfo.ModelCapabilities
}

// NewClient creates a new DMR client from the provided configuration
Expand Down Expand Up @@ -138,16 +146,21 @@ func NewClient(ctx context.Context, cfg *latest.ModelConfig, opts ...options.Opt
ModelOptions: globalOptions,
BaseURL: baseURL,
},
client: openai.NewClient(clientOptions...),
httpClient: httpClient,
engine: engine,
client: openai.NewClient(clientOptions...),
httpClient: httpClient,
engine: engine,
attachmentCaps: modelinfo.CapsWith(parsed.supportsImages, parsed.supportsPDF),
}, nil
}

// convertMessages converts chat messages to OpenAI format and merges consecutive
// system/user messages, which is needed by some local models run by DMR.
//
// Attachment capabilities are injected explicitly from provider_opts rather than
// resolved from models.dev: DMR is not a models.dev provider, so a store lookup
// would always miss and silently drop image/PDF attachments.
//
// ctx and messages are handed to the oaistream conversion as-is; the value
// returned is the output of oaistream.MergeConsecutiveMessages.
func (c *Client) convertMessages(ctx context.Context, messages []chat.Message) []openai.ChatCompletionMessageParamUnion {
openaiMessages := oaistream.ConvertMessages(ctx, messages, c.ID(), c.ModelOptions.ModelsDevStore())
openaiMessages := oaistream.ConvertMessagesWithCaps(ctx, messages, c.attachmentCaps)
// Merge after conversion so the merge step operates on OpenAI-format messages.
return oaistream.MergeConsecutiveMessages(openaiMessages)
}

Expand Down
40 changes: 40 additions & 0 deletions pkg/model/provider/dmr/configure.go
Original file line number Diff line number Diff line change
Expand Up @@ -211,6 +211,32 @@ func parseInt(v any) (int, bool) {
return 0, false
}

// parseBoolOpt reads the boolean provider_opts entry stored under key.
//
// A missing key (including a nil or empty map) yields (false, nil). A native
// bool is returned unchanged. A string is trimmed of surrounding whitespace and
// handed to strconv.ParseBool, so "true", "false", "1", "0", etc. are accepted.
// Any other string, or a value of any other type, produces an error that names
// the key so the caller can fail fast on a bad configuration.
func parseBoolOpt(opts map[string]any, key string) (bool, error) {
	// Indexing a nil map is safe and simply reports the key as absent.
	raw, present := opts[key]
	if !present {
		return false, nil
	}

	if flag, isBool := raw.(bool); isBool {
		return flag, nil
	}

	text, isString := raw.(string)
	if !isString {
		return false, fmt.Errorf("provider_opts: %q must be a boolean, got %T", key, raw)
	}

	parsed, err := strconv.ParseBool(strings.TrimSpace(text))
	if err != nil {
		// Report the original, untrimmed text so the user sees what they wrote.
		return false, fmt.Errorf("provider_opts: %q must be a boolean, got %q", key, text)
	}
	return parsed, nil
}

// parseInt64Value parses an int64 from YAML/JSON-decoded values (int, float64, string).
func parseInt64Value(v any) (int64, bool) {
switch t := v.(type) {
Expand Down Expand Up @@ -354,6 +380,8 @@ type dmrParseResult struct {
vllm *vllmConfig
keepAlive *string
mode *string
supportsImages bool
supportsPDF bool
}

// parseDMRProviderOpts extracts DMR-specific provider options from the model
Expand Down Expand Up @@ -404,6 +432,18 @@ func parseDMRProviderOpts(engine string, cfg *latest.ModelConfig) (dmrParseResul
res.rawRuntimeFlags = raw
}

supportsImages, err := parseBoolOpt(cfg.ProviderOpts, "supports_images")
if err != nil {
return res, err
}
res.supportsImages = supportsImages

supportsPDF, err := parseBoolOpt(cfg.ProviderOpts, "supports_pdf")
if err != nil {
return res, err
}
res.supportsPDF = supportsPDF

slog.Debug("DMR provider opts", "provider_opts", cfg.ProviderOpts, "engine", engine)

if len(cfg.ProviderOpts) == 0 {
Expand Down
Loading
Loading