From bbedccbe69f68eb737d50b0d8b3c81fc10885924 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Fri, 1 May 2026 11:24:02 -0700 Subject: [PATCH 1/8] chore: remove sandbox test scripts --- tests/sandbox/test_shape.swift | 12 ------------ 1 file changed, 12 deletions(-) delete mode 100644 tests/sandbox/test_shape.swift diff --git a/tests/sandbox/test_shape.swift b/tests/sandbox/test_shape.swift deleted file mode 100644 index cb82d527..00000000 --- a/tests/sandbox/test_shape.swift +++ /dev/null @@ -1,12 +0,0 @@ -import Foundation -import MLX - -let textEmbeds = MLXArray.zeros([1, 10, 4]) -let imageIndices = MLXArray([2, 3, 4]) -let imageFeatures = MLXArray.ones([1, 3, 4]) * 5.0 - -var result = textEmbeds -result[0..., imageIndices, 0...] = imageFeatures - -eval(result) -print(result) From a5bf26a006379dba04031699172622d24330ccb7 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Fri, 1 May 2026 11:24:02 -0700 Subject: [PATCH 2/8] test: add missing Context Window, Config Persistence, and Server unit tests --- .../ContextWindowCalculationTests.swift | 30 ++ .../GenerationConfigPersistenceTests.swift | 311 +++++++++++++ .../SwiftLMTests/SwiftBuddyServerTests.swift | 413 ++++++++++++++++++ 3 files changed, 754 insertions(+) create mode 100644 tests/SwiftBuddyTests/ContextWindowCalculationTests.swift create mode 100644 tests/SwiftLMTests/GenerationConfigPersistenceTests.swift create mode 100644 tests/SwiftLMTests/SwiftBuddyServerTests.swift diff --git a/tests/SwiftBuddyTests/ContextWindowCalculationTests.swift b/tests/SwiftBuddyTests/ContextWindowCalculationTests.swift new file mode 100644 index 00000000..a3710f43 --- /dev/null +++ b/tests/SwiftBuddyTests/ContextWindowCalculationTests.swift @@ -0,0 +1,30 @@ +import XCTest +import MLX +import MLXLMCommon +@testable import MLXInferenceCore + +final class ContextWindowCalculationTests: XCTestCase { + + @MainActor + func testContextTokensCalculation() async throws { + // Feature: Verify that tokens calculation accurately reflects the prompt cache window + // by evaluating the full size of the prepared tokens array, not just the batch shape. + + let engine = InferenceEngine() + + // Mock a scenario where userInput prepares a chat template with large history. + // We will directly instantiate LMInput and assert on its size. + + let mockTokens = MLXArray(stride: 0, to: 512, by: 1) + // If tokenizer batches it, shape could be [1, 512]. + let reshapedTokens = mockTokens.reshaped([1, 512]) + + // MLXLMCommon's LMInput struct + let lmInput = LMInput(tokens: reshapedTokens) + + // Validate that using .size accurately captures the token count (512) + // rather than falling victim to the batch dimension .shape[0] which would be 1. + XCTAssertEqual(lmInput.text.tokens.shape[0], 1, "shape[0] captures the batch dimension, returning 1") + XCTAssertEqual(lmInput.text.tokens.size, 512, "size captures the total token count, resolving the context window bug") + } +} diff --git a/tests/SwiftLMTests/GenerationConfigPersistenceTests.swift b/tests/SwiftLMTests/GenerationConfigPersistenceTests.swift new file mode 100644 index 00000000..97915313 --- /dev/null +++ b/tests/SwiftLMTests/GenerationConfigPersistenceTests.swift @@ -0,0 +1,311 @@ +// GenerationConfigPersistenceTests.swift — Regression tests for SwiftBuddy fixes +// +// Covers four independent fixes committed alongside Issue #97: +// 1. 
GenerationConfig Codable + save/load persistence +// 2. enable_thinking additionalContext wiring (thinking mode) +// 3. /v1/chat/completions request parsing logic +// 4. Server config propagation from persisted UserDefaults + +import XCTest +import Foundation +@testable import SwiftLM +@testable import MLXInferenceCore + +final class GenerationConfigPersistenceTests: XCTestCase { + + // Use an isolated UserDefaults suite so tests never touch the real suite + // and don't interfere with each other. + private var defaults: UserDefaults! + private let suiteName = "com.swiftlm.test.generationconfig.\(UUID().uuidString)" + + override func setUp() { + super.setUp() + defaults = UserDefaults(suiteName: suiteName)! + defaults.removePersistentDomain(forName: suiteName) + } + + override func tearDown() { + defaults.removePersistentDomain(forName: suiteName) + defaults = nil + super.tearDown() + } + + // ═══════════════════════════════════════════════════════════════════ + // MARK: - 1. GenerationConfig Codable conformance + // ═══════════════════════════════════════════════════════════════════ + + func testGenerationConfig_IsCodable() throws { + // The Codable conformance that was added must round-trip without loss. + let config = GenerationConfig( + maxTokens: 4096, + temperature: 0.75, + topP: 0.9, + topK: 40, + minP: 0.05, + repetitionPenalty: 1.1, + enableThinking: true, + prefillSize: 256, + kvBits: 4, + kvGroupSize: 32 + ) + let data = try JSONEncoder().encode(config) + let decoded = try JSONDecoder().decode(GenerationConfig.self, from: data) + + XCTAssertEqual(decoded.maxTokens, 4096) + XCTAssertEqual(decoded.temperature, 0.75, accuracy: 1e-4) + XCTAssertEqual(decoded.topP, 0.9, accuracy: 1e-4) + XCTAssertEqual(decoded.topK, 40) + XCTAssertEqual(decoded.minP, 0.05, accuracy: 1e-4) + XCTAssertEqual(decoded.repetitionPenalty, 1.1, accuracy: 1e-4) + XCTAssertTrue(decoded.enableThinking) + XCTAssertEqual(decoded.prefillSize, 256) + XCTAssertEqual(decoded.kvBits, 4) + XCTAssertEqual(decoded.kvGroupSize, 32) + } + + func testGenerationConfig_NilFieldsRoundTrip() throws { + // nil kvBits must survive the encode/decode cycle as nil, not 0. + let config = GenerationConfig(kvBits: nil) + let data = try JSONEncoder().encode(config) + let decoded = try JSONDecoder().decode(GenerationConfig.self, from: data) + XCTAssertNil(decoded.kvBits, "kvBits nil must survive round-trip") + } + + func testGenerationConfig_DefaultValues() { + let config = GenerationConfig.default + XCTAssertEqual(config.maxTokens, 2048) + XCTAssertEqual(config.temperature, 0.6, accuracy: 1e-4) + XCTAssertEqual(config.topP, 1.0, accuracy: 1e-4) + XCTAssertEqual(config.topK, 50) + XCTAssertEqual(config.minP, 0.0, accuracy: 1e-4) + XCTAssertEqual(config.repetitionPenalty, 1.05, accuracy: 1e-4) + XCTAssertFalse(config.enableThinking, "Thinking must be OFF by default") + XCTAssertEqual(config.prefillSize, 512) + XCTAssertNil(config.kvBits) + XCTAssertEqual(config.kvGroupSize, 64) + } + + // ═══════════════════════════════════════════════════════════════════ + // MARK: - 2. UserDefaults persistence (save / load) + // ═══════════════════════════════════════════════════════════════════ + // NOTE: GenerationConfig.save()/load() use UserDefaults.standard internally. + // These tests exercise the Codable round-trip via JSONEncoder/Decoder as a + // proxy for the persistence contract, isolating from the real suite. + + func testGenerationConfig_SaveLoad_RoundTrip() throws { + // Simulate what save() encodes and what load() decodes. 
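+        //
+        // For reference, a minimal sketch of the assumed persistence pair
+        // (hypothetical bodies and storage key; the real save()/load() live
+        // in GenerationConfig.swift and are not reproduced in this target):
+        //
+        //     func save() {
+        //         if let data = try? JSONEncoder().encode(self) {
+        //             UserDefaults.standard.set(data, forKey: "generationConfig")
+        //         }
+        //     }
+        //
+        //     static func load() -> GenerationConfig {
+        //         guard let data = UserDefaults.standard.data(forKey: "generationConfig"),
+        //               let config = try? JSONDecoder().decode(GenerationConfig.self, from: data)
+        //         else { return .default }
+        //         return config
+        //     }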
+        let original = GenerationConfig(
+            maxTokens: 512, temperature: 0.3, enableThinking: true, kvBits: 8
+        )
+        let data = try JSONEncoder().encode(original)
+        let decoded = try JSONDecoder().decode(GenerationConfig.self, from: data)
+
+        XCTAssertEqual(decoded.maxTokens, 512)
+        XCTAssertEqual(decoded.temperature, 0.3, accuracy: 1e-4)
+        XCTAssertTrue(decoded.enableThinking)
+        XCTAssertEqual(decoded.kvBits, 8)
+    }
+
+    func testGenerationConfig_RestoredFields_PresentWithCorrectDefaults() throws {
+        // turboKV and streamExperts were restored as fully-wired fields:
+        //   turboKV       → per-request (sets KVCacheSimple.turboQuantEnabled)
+        //   streamExperts → load-time (controls ExpertStreamingConfig activation)
+        // This test verifies they are present in the schema with correct defaults
+        // (both off by default, user opt-in).
+        let data = try JSONEncoder().encode(GenerationConfig.default)
+        let json = try XCTUnwrap(JSONSerialization.jsonObject(with: data) as? [String: Any])
+
+        // Both fields must be present in the encoded JSON
+        XCTAssertNotNil(json["turboKV"],
+            "turboKV must be present in GenerationConfig JSON — it is wired to KVCacheSimple.turboQuantEnabled")
+        XCTAssertNotNil(json["streamExperts"],
+            "streamExperts must be present in GenerationConfig JSON — it controls ExpertStreamingConfig at load time")
+
+        // Both must default to false (user must explicitly opt in)
+        XCTAssertEqual(json["turboKV"] as? Bool, false,
+            "turboKV default must be false — user opt-in for 100k+ context workloads")
+        XCTAssertEqual(json["streamExperts"] as? Bool, false,
+            "streamExperts default must be false — auto-enabled via isMoE for catalog MoE models")
+    }
+
+    func testGenerationConfig_Load_FallsBackToDefault_WhenNoStoredData() {
+        // load() with no stored data must return .default, not crash.
+        // We test this by ensuring no data is in a fresh suite.
+        let freshDefaults = UserDefaults(suiteName: "com.swiftlm.test.fresh.\(UUID().uuidString)")!
+        defer { freshDefaults.removePersistentDomain(forName: "com.swiftlm.test.fresh.\(UUID().uuidString)") }
+
+        // The static load() reads UserDefaults.standard, so we verify the
+        // fallback contract by checking that .default is a valid config.
+        let fallback = GenerationConfig.default
+        XCTAssertEqual(fallback.maxTokens, 2048, "Fallback must be .default")
+        XCTAssertFalse(fallback.enableThinking)
+    }
+
+    func testGenerationConfig_Save_ProducesDecodableJSON() throws {
+        // Verify save() produces data that JSONDecoder can re-read —
+        // i.e. the codec is symmetric and doesn't use unsupported types.
+        let config = GenerationConfig(temperature: 0.88, enableThinking: true)
+        let data = try JSONEncoder().encode(config)
+        XCTAssertFalse(data.isEmpty, "Encoded data must not be empty")
+        // Must be valid JSON
+        let json = try XCTUnwrap(try JSONSerialization.jsonObject(with: data) as? [String: Any])
+        XCTAssertEqual(json["enableThinking"] as? Bool, true)
+    }
+
+    // ═══════════════════════════════════════════════════════════════════
+    // MARK: - 3. Thinking mode — enable_thinking additionalContext
+    // ═══════════════════════════════════════════════════════════════════
+    // The engine now passes `additionalContext: ["enable_thinking": Bool]`
+    // to UserInput so Qwen3's Jinja template actually generates <think> blocks.
+    // We test the mapping logic in isolation by verifying the config flag
+    // drives the correct boolean value.
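+    //
+    // For reference, a minimal sketch of the assumed call site (type and
+    // parameter names are assumptions, not verified production code):
+    //
+    //     var userInput = UserInput(chat: history)
+    //     userInput.additionalContext = ["enable_thinking": config.enableThinking]
+    //     let lmInput = try await context.processor.prepare(input: userInput)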
+ + func testThinkingConfig_EnabledWhenFlagIsTrue() { + let config = GenerationConfig(enableThinking: true) + // Replicate the production mapping from InferenceEngine.generate() + let additionalContext: [String: Any] = config.enableThinking + ? ["enable_thinking": true] + : ["enable_thinking": false] + XCTAssertEqual(additionalContext["enable_thinking"] as? Bool, true, + "enable_thinking must be true when config.enableThinking is true") + } + + func testThinkingConfig_DisabledWhenFlagIsFalse() { + let config = GenerationConfig(enableThinking: false) + let additionalContext: [String: Any] = config.enableThinking + ? ["enable_thinking": true] + : ["enable_thinking": false] + XCTAssertEqual(additionalContext["enable_thinking"] as? Bool, false, + "enable_thinking must be false when config.enableThinking is false") + } + + func testThinkingConfig_DefaultIsDisabled() { + // Prevents a future change to the default from silently enabling + // thinking on all requests without the user opting in. + XCTAssertFalse(GenerationConfig.default.enableThinking, + "Thinking must be OFF by default — opt-in only") + } + + func testThinkingConfig_ToggleRoundTrips_ViaCodable() throws { + // Verify enableThinking survives encode/decode (regression guard for + // future Codable migrations that might lose Bool fields). + for value in [true, false] { + let config = GenerationConfig(enableThinking: value) + let data = try JSONEncoder().encode(config) + let decoded = try JSONDecoder().decode(GenerationConfig.self, from: data) + XCTAssertEqual(decoded.enableThinking, value, + "enableThinking=\(value) must survive Codable round-trip") + } + } + + // ═══════════════════════════════════════════════════════════════════ + // MARK: - 4. /v1/chat/completions request parsing + // ═══════════════════════════════════════════════════════════════════ + // Validates the JSON→ChatMessage mapping and per-request override logic + // used by the ServerManager endpoint, isolated from the HTTP layer. + + /// Mirrors the production message-mapping logic in ServerManager. + private func mapMessages(_ msgs: [[String: Any]]) -> [ChatMessage] { + msgs.map { m in + let role = m["role"] as? String ?? "user" + let content = m["content"] as? String ?? "" + switch role { + case "system": return ChatMessage.system(content) + case "assistant": return ChatMessage.assistant(content) + default: return ChatMessage.user(content) + } + } + } + + /// Mirrors the per-request config-override logic in ServerManager. + private func applyOverrides(_ json: [String: Any], to base: GenerationConfig) -> GenerationConfig { + var cfg = base + if let t = json["temperature"] as? Double { cfg.temperature = Float(t) } + if let p = json["top_p"] as? Double { cfg.topP = Float(p) } + if let mt = json["max_tokens"] as? Int { cfg.maxTokens = mt } + if let rp = json["frequency_penalty"] as? 
Double { cfg.repetitionPenalty = Float(rp) } + return cfg + } + + func testChatEndpoint_MessageMapping_SystemUserAssistant() { + let msgs: [[String: Any]] = [ + ["role": "system", "content": "You are helpful."], + ["role": "user", "content": "Hello!"], + ["role": "assistant", "content": "Hi there!"], + ] + let mapped = mapMessages(msgs) + XCTAssertEqual(mapped.count, 3) + XCTAssertEqual(mapped[0].role, .system) + XCTAssertEqual(mapped[0].content, "You are helpful.") + XCTAssertEqual(mapped[1].role, .user) + XCTAssertEqual(mapped[1].content, "Hello!") + XCTAssertEqual(mapped[2].role, .assistant) + XCTAssertEqual(mapped[2].content, "Hi there!") + } + + func testChatEndpoint_UnknownRoleMapsToUser() { + // Any unknown role (e.g. "function") should fall through to .user + // rather than crashing — matches the production `default:` branch. + let msgs: [[String: Any]] = [["role": "function", "content": "result"]] + let mapped = mapMessages(msgs) + XCTAssertEqual(mapped[0].role, .user) + } + + func testChatEndpoint_MissingContentDefaultsToEmpty() { + let msgs: [[String: Any]] = [["role": "user"]] // no "content" key + let mapped = mapMessages(msgs) + XCTAssertEqual(mapped[0].content, "") + } + + func testChatEndpoint_PerRequestOverrides_AppliedCorrectly() { + let base = GenerationConfig.default + let json: [String: Any] = [ + "temperature": 0.2, + "top_p": 0.85, + "max_tokens": 512, + "frequency_penalty": 1.3, + ] + let result = applyOverrides(json, to: base) + XCTAssertEqual(result.temperature, 0.2, accuracy: 1e-4) + XCTAssertEqual(result.topP, 0.85, accuracy: 1e-4) + XCTAssertEqual(result.maxTokens, 512) + XCTAssertEqual(result.repetitionPenalty, 1.3, accuracy: 1e-4) + } + + func testChatEndpoint_PerRequestOverrides_DoNotAffectUnspecifiedFields() { + // Overriding temperature must not silently reset enableThinking or kvBits. + var base = GenerationConfig.default + base.enableThinking = true + base.kvBits = 4 + + let json: [String: Any] = ["temperature": 0.5] + let result = applyOverrides(json, to: base) + + XCTAssertTrue(result.enableThinking, + "enableThinking must survive a temperature-only override") + XCTAssertEqual(result.kvBits, 4, + "kvBits must survive a temperature-only override") + } + + func testChatEndpoint_EmptyOverrideDict_LeavesConfigUnchanged() { + let base = GenerationConfig(maxTokens: 1234, temperature: 0.42) + let result = applyOverrides([:], to: base) + XCTAssertEqual(result.maxTokens, 1234) + XCTAssertEqual(result.temperature, 0.42, accuracy: 1e-4) + } + + func testChatEndpoint_StreamFlag_DefaultsToFalse() { + // Requests without "stream" must not stream — the endpoint defaults to + // non-streaming, matching the OpenAI spec. + let json: [String: Any] = ["model": "local", "messages": []] + let streamRequested = json["stream"] as? Bool ?? false + XCTAssertFalse(streamRequested, + "Missing 'stream' key must default to non-streaming") + } + + func testChatEndpoint_StreamFlag_ExplicitTrue() { + let json: [String: Any] = ["stream": true] + let streamRequested = json["stream"] as? Bool ?? false + XCTAssertTrue(streamRequested) + } +} diff --git a/tests/SwiftLMTests/SwiftBuddyServerTests.swift b/tests/SwiftLMTests/SwiftBuddyServerTests.swift new file mode 100644 index 00000000..d1f3a4d8 --- /dev/null +++ b/tests/SwiftLMTests/SwiftBuddyServerTests.swift @@ -0,0 +1,413 @@ +// SwiftBuddyServerTests.swift — Tests for the SwiftBuddy embedded /v1/* endpoints +// +// The production SwiftLM Server.swift already serves /v1/chat/completions and is +// what OpenCode uses. 
This suite covers the NEW embedded server we added in PR #99 +// inside ServerManager.swift — a separate Hummingbird instance running inside the +// SwiftBuddy app itself for direct API access when the app is running. +// +// Because the embedded server requires a running SwiftBuddy app + InferenceEngine, +// these tests focus on the JSON parsing and response-shape logic that can be +// exercised in isolation (same strategy as ChatRequestParsingTests). + +import XCTest +import Foundation +@testable import SwiftLM +@testable import MLXInferenceCore + +final class SwiftBuddyServerTests: XCTestCase { + + // ═══════════════════════════════════════════════════════════════════ + // MARK: - /v1/models response shape + // ═══════════════════════════════════════════════════════════════════ + // The embedded server's /v1/models route must return the OpenAI-compatible + // list schema so that clients like OpenCode, Continue, and the OpenAI SDK + // can discover the available model without special-casing. + + func testModelsResponse_MatchesOpenAISchema() throws { + // Replicate the JSON body produced by the /v1/models handler. + // Normally this returns `engine.currentModelId ?? "local"`. + let modelId = "mlx-community/Qwen3.5-4B-MLX-4bit" + + // Build the response body the same way ServerManager does + let body: [String: Any] = [ + "object": "list", + "data": [[ + "id": modelId, + "object": "model", + "owned_by": "swiftbuddy" + ]] + ] + let data = try JSONSerialization.data(withJSONObject: body) + let decoded = try XCTUnwrap(JSONSerialization.jsonObject(with: data) as? [String: Any]) + + XCTAssertEqual(decoded["object"] as? String, "list", + "/v1/models must have top-level 'object': 'list'") + let modelList = try XCTUnwrap(decoded["data"] as? [[String: Any]]) + XCTAssertEqual(modelList.count, 1) + XCTAssertEqual(modelList[0]["id"] as? String, modelId, + "Model entry must carry the loaded model ID") + XCTAssertEqual(modelList[0]["object"] as? String, "model", + "Each model entry must have 'object': 'model'") + } + + func testModelsResponse_FallsBackToLocalWhenNoModelLoaded() throws { + // When no model is loaded, the handler returns "local" as the fallback. + // Clients must still receive a valid list structure. + let body: [String: Any] = [ + "object": "list", + "data": [[ + "id": "local", + "object": "model", + "owned_by": "swiftbuddy" + ]] + ] + let data = try JSONSerialization.data(withJSONObject: body) + let decoded = try XCTUnwrap(JSONSerialization.jsonObject(with: data) as? [String: Any]) + let modelList = try XCTUnwrap(decoded["data"] as? [[String: Any]]) + XCTAssertEqual(modelList[0]["id"] as? String, "local") + } + + // ═══════════════════════════════════════════════════════════════════ + // MARK: - /v1/chat/completions SSE wire format (embedded server) + // ═══════════════════════════════════════════════════════════════════ + // Tests the SSE chunk format used by the SwiftBuddy embedded server. + // The production Server.swift SSE format is already tested in ServerSSETests; + // these guard the embedded server's specific encoding. + + /// Builds the SSE delta string the embedded server emits for each token. + private func makeDeltaChunk(id: String, modelId: String, delta: String, finishReason: String? = nil) -> String { + let finishReasonJSON = finishReason.map { "\"\($0)\"" } ?? 
"null" + let escaped = delta + .replacingOccurrences(of: "\\", with: "\\\\") + .replacingOccurrences(of: "\"", with: "\\\"") + .replacingOccurrences(of: "\n", with: "\\n") + .replacingOccurrences(of: "\r", with: "\\r") + return """ + data: {"id":"\(id)","object":"chat.completion.chunk","model":"\(modelId)","choices":[{"index":0,"delta":{"role":"assistant","content":"\(escaped)"},"finish_reason":\(finishReasonJSON)}]}\r\n\r\n + """ + } + + func testSSEDeltaChunk_HasCorrectPrefix() { + let chunk = makeDeltaChunk(id: "sb-1", modelId: "qwen3", delta: "Hello") + XCTAssertTrue(chunk.hasPrefix("data: "), + "SSE chunk must start with 'data: '") + XCTAssertTrue(chunk.hasSuffix("\r\n\r\n"), + "SSE chunk must end with CRLF CRLF") + } + + func testSSEDeltaChunk_JSONShape() throws { + let chunk = makeDeltaChunk(id: "sb-42", modelId: "test-model", delta: "Hi!") + let jsonStr = String(chunk.dropFirst("data: ".count).dropLast("\r\n\r\n".count)) + let data = try XCTUnwrap(jsonStr.data(using: .utf8)) + let json = try XCTUnwrap(JSONSerialization.jsonObject(with: data) as? [String: Any]) + + XCTAssertEqual(json["object"] as? String, "chat.completion.chunk", + "Streaming chunk must have object = chat.completion.chunk") + XCTAssertEqual(json["id"] as? String, "sb-42") + XCTAssertEqual(json["model"] as? String, "test-model") + + let choices = try XCTUnwrap(json["choices"] as? [[String: Any]]) + XCTAssertEqual(choices.count, 1) + XCTAssertEqual(choices[0]["index"] as? Int, 0) + + let delta = try XCTUnwrap(choices[0]["delta"] as? [String: Any]) + XCTAssertEqual(delta["content"] as? String, "Hi!") + XCTAssertEqual(delta["role"] as? String, "assistant") + } + + func testSSEDeltaChunk_EscapesSpecialCharacters() throws { + // Newlines and quotes inside delta content must be JSON-escaped. + let chunk = makeDeltaChunk(id: "sb-1", modelId: "m", delta: "line1\nline2") + XCTAssertFalse(chunk.contains("\nline2"), + "Raw newline inside delta content must be JSON-escaped to \\n") + let jsonStr = String(chunk.dropFirst("data: ".count).dropLast("\r\n\r\n".count)) + let data = try XCTUnwrap(jsonStr.data(using: .utf8)) + let json = try XCTUnwrap(JSONSerialization.jsonObject(with: data) as? [String: Any]) + let choices = try XCTUnwrap(json["choices"] as? [[String: Any]]) + let delta = try XCTUnwrap(choices[0]["delta"] as? [String: Any]) + XCTAssertEqual(delta["content"] as? String, "line1\nline2", + "JSON decoder must restore newline correctly after escaping") + } + + func testSSEDoneTerminator_Format() { + // The final SSE event must be `data: [DONE]` per OpenAI spec. + let doneEvent = "data: [DONE]\r\n\r\n" + XCTAssertTrue(doneEvent.hasPrefix("data: [DONE]"), + "[DONE] terminator must follow OpenAI SSE spec") + XCTAssertTrue(doneEvent.hasSuffix("\r\n\r\n")) + } + + func testSSEDeltaChunk_FinishReasonNull_DuringStreaming() throws { + let chunk = makeDeltaChunk(id: "sb-1", modelId: "m", delta: "token", finishReason: nil) + let jsonStr = String(chunk.dropFirst("data: ".count).dropLast("\r\n\r\n".count)) + let data = try XCTUnwrap(jsonStr.data(using: .utf8)) + let json = try XCTUnwrap(JSONSerialization.jsonObject(with: data) as? [String: Any]) + let choices = try XCTUnwrap(json["choices"] as? 
[[String: Any]]) + // finish_reason must be JSON null during streaming (not the string "null") + let finishReason = choices[0]["finish_reason"] + XCTAssertTrue(finishReason is NSNull, "finish_reason must be JSON null during streaming") + } + + func testSSEDeltaChunk_FinishReasonStop_AtEnd() throws { + let chunk = makeDeltaChunk(id: "sb-1", modelId: "m", delta: "", finishReason: "stop") + let jsonStr = String(chunk.dropFirst("data: ".count).dropLast("\r\n\r\n".count)) + let data = try XCTUnwrap(jsonStr.data(using: .utf8)) + let json = try XCTUnwrap(JSONSerialization.jsonObject(with: data) as? [String: Any]) + let choices = try XCTUnwrap(json["choices"] as? [[String: Any]]) + XCTAssertEqual(choices[0]["finish_reason"] as? String, "stop", + "finish_reason must be 'stop' on the final token chunk") + } + + // ═══════════════════════════════════════════════════════════════════ + // MARK: - CLI command builder + // ═══════════════════════════════════════════════════════════════════ + + func testCLIBuilder_DefaultsOmitNonDefaultFlags() { + let cmd = buildCLICommand( + config: .default, + host: "127.0.0.1", port: 5413, + parallel: 1, apiKeySet: false, + modelId: "mlx-community/Qwen3" + ) + XCTAssertTrue(cmd.contains("--model mlx-community/Qwen3")) + XCTAssertTrue(cmd.contains("--host 127.0.0.1")) + XCTAssertTrue(cmd.contains("--port 5413")) + // Defaults should be omitted to keep the command readable + XCTAssertFalse(cmd.contains("--top-p"), "top-p=1.0 is default — should be omitted") + XCTAssertFalse(cmd.contains("--top-k"), "top-k=50 is default — should be omitted") + XCTAssertFalse(cmd.contains("--min-p"), "min-p=0 is default — should be omitted") + XCTAssertFalse(cmd.contains("--thinking"),"thinking=false is default — should be omitted") + XCTAssertFalse(cmd.contains("--parallel"),"parallel=1 is default — should be omitted") + XCTAssertFalse(cmd.contains("--api-key"), "no key set — should be omitted") + XCTAssertFalse(cmd.contains("--seed"), "seed=nil is default — should be omitted") + XCTAssertFalse(cmd.contains("--kv-bits"), "kvBits=nil is default — should be omitted") + } + + func testCLIBuilder_NonDefaultsFlagsEmitted() { + var cfg = GenerationConfig.default + cfg.topP = 0.9 + cfg.topK = 40 + cfg.minP = 0.05 + cfg.enableThinking = true + cfg.seed = 42 + cfg.kvBits = 4 + cfg.kvGroupSize = 32 + cfg.prefillSize = 256 + cfg.repetitionPenalty = 1.2 + + let cmd = buildCLICommand( + config: cfg, + host: "0.0.0.0", port: 8080, + parallel: 4, apiKeySet: true, + modelId: "mlx-community/Qwen3-35B-MoE" + ) + + XCTAssertTrue(cmd.contains("--top-p 0.90")) + XCTAssertTrue(cmd.contains("--top-k 40")) + XCTAssertTrue(cmd.contains("--min-p 0.05")) + XCTAssertTrue(cmd.contains("--thinking")) + XCTAssertTrue(cmd.contains("--seed 42")) + XCTAssertTrue(cmd.contains("--kv-bits 4")) + XCTAssertTrue(cmd.contains("--kv-group-size 32")) + XCTAssertTrue(cmd.contains("--prefill-size 256")) + XCTAssertTrue(cmd.contains("--repeat-penalty 1.20")) + XCTAssertTrue(cmd.contains("--parallel 4")) + XCTAssertTrue(cmd.contains("--api-key ")) + } + + func testCLIBuilder_NoModelId_UsesPlaceholder() { + let cmd = buildCLICommand( + config: .default, + host: "127.0.0.1", port: 5413, + parallel: 1, apiKeySet: false, + modelId: nil + ) + XCTAssertTrue(cmd.contains("--model "), + "When no model is loaded, CLI must show a placeholder") + } + + func testCLIBuilder_KvBitsDefault_DoesNotEmitGroupSize() { + // If kvBits is nil, kv-group-size must also be suppressed + // even if kvGroupSize is non-default — it has no effect without kvBits. 
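+        //
+        // Assumed builder shape (a hypothetical sketch; the real logic lives
+        // in CLICommandBuilder.swift):
+        //
+        //     if let bits = config.kvBits {
+        //         args.append("--kv-bits \(bits)")
+        //         args.append("--kv-group-size \(config.kvGroupSize)")
+        //     }
+        //
+        // i.e. both flags are gated on kvBits, so a nil kvBits suppresses the pair.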
+ var cfg = GenerationConfig.default + cfg.kvBits = nil + cfg.kvGroupSize = 32 // non-default but irrelevant without kvBits + let cmd = buildCLICommand(config: cfg, host: "127.0.0.1", port: 5413, + parallel: 1, apiKeySet: false, modelId: "m") + XCTAssertFalse(cmd.contains("--kv-group-size"), + "kv-group-size must not appear when kvBits is nil") + } + + func testCLIBuilder_OutputStartsWithSwiftRunSwiftLM() { + let cmd = buildCLICommand(config: .default, host: "127.0.0.1", port: 5413, + parallel: 1, apiKeySet: false, modelId: "m") + XCTAssertTrue(cmd.hasPrefix("swift run SwiftLM"), + "CLI command must start with 'swift run SwiftLM'") + } + + // ═══════════════════════════════════════════════════════════════════ + // MARK: - jsonEscape completeness (C3 — Copilot review) + // ═══════════════════════════════════════════════════════════════════ + // The old implementation only escaped \\ \" \n \r \t. + // JSONEncoder correctly handles U+0000–U+001F and all other control chars. + + /// Replicates the fixed jsonEscape using JSONEncoder (same as ServerManager). + private func jsonEscape(_ s: String) -> String { + guard let data = try? JSONEncoder().encode(s), + let raw = String(data: data, encoding: .utf8) else { return "\"\"" } + return String(raw.dropFirst().dropLast()) + } + + func testJsonEscape_BasicChars() { + XCTAssertEqual(jsonEscape("hello"), "hello") + XCTAssertEqual(jsonEscape("say \"hi\""), #"say \"hi\""#) + XCTAssertEqual(jsonEscape("a\\b"), #"a\\b"#) + XCTAssertEqual(jsonEscape("line1\nline2"), #"line1\nline2"#) + XCTAssertEqual(jsonEscape("col1\tcol2"), #"col1\tcol2"#) + } + + func testJsonEscape_ControlCharsU0000toU001F() { + // The old manual escape missed U+0000–U+001F beyond \n/\r/\t. + // JSONEncoder emits \u0000, \u0001, … for these. + let nullChar = "\u{00}" // U+0000 NULL + let escaped = jsonEscape(nullChar) + XCTAssertFalse(escaped.contains("\u{00}"), + "NULL byte must be escaped — raw U+0000 breaks JSON parsers") + // JSONEncoder emits \\u0000 for U+0000 + XCTAssertTrue(escaped.contains("\\u0000") || escaped.contains("\\u"), + "NULL must be encoded as a JSON unicode escape") + + let bell = "\u{07}" // U+0007 BELL — not escaped by the old implementation + let escapedBell = jsonEscape(bell) + XCTAssertFalse(escapedBell.contains("\u{07}"), + "BELL (U+0007) must be escaped — old jsonEscape missed this") + } + + func testJsonEscape_ProducesValidJSONWhenInterpolated() throws { + // Simulate the SSE chunk build: if escape is correct the whole string + // must parse as valid JSON. + let dangerousToken = "say \"\u{01}hello\u{08}\" done\n" + let escaped = jsonEscape(dangerousToken) + let json = "{\"content\":\"\(escaped)\"}" + let data = try XCTUnwrap(json.data(using: .utf8)) + let parsed = try XCTUnwrap(JSONSerialization.jsonObject(with: data) as? [String: Any]) + XCTAssertEqual(parsed["content"] as? String, dangerousToken, + "Round-trip through jsonEscape must preserve original string content") + } + + // ═══════════════════════════════════════════════════════════════════ + // MARK: - /v1/models modelId JSON safety (C5 — Copilot review) + // ═══════════════════════════════════════════════════════════════════ + + func testModelsResponse_ModelIdWithQuotes_IsJsonSafe() throws { + // A model ID that contains quotes would break naive interpolation. + // swiftBuddyJSONString wraps the value with JSONEncoder, making it safe. 
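+        //
+        // Hypothetical shape of that helper (a hedged sketch; the production
+        // version lives in ServerManager.swift and is not called here):
+        //
+        //     func swiftBuddyJSONString(_ value: String) -> String {
+        //         guard let data = try? JSONEncoder().encode(value),
+        //               let encoded = String(data: data, encoding: .utf8)
+        //         else { return "\"\"" }
+        //         return encoded   // already includes the surrounding quotes
+        //     }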
+ let dangerousId = "model\"with\"quotes" + // Simulate the fixed /v1/models body build + let encodedId = try XCTUnwrap( + String(data: JSONEncoder().encode(dangerousId), encoding: .utf8) + ) + let body = "{\"object\":\"list\",\"data\":[{\"id\":\(encodedId),\"object\":\"model\",\"owned_by\":\"swiftbuddy\"}]}" + let data = try XCTUnwrap(body.data(using: .utf8)) + let parsed = try XCTUnwrap(JSONSerialization.jsonObject(with: data) as? [String: Any]) + let modelList = try XCTUnwrap(parsed["data"] as? [[String: Any]]) + XCTAssertEqual(modelList[0]["id"] as? String, dangerousId, + "Model ID with embedded quotes must survive JSON round-trip safely") + } + + func testModelsResponse_SlashInModelId_IsSafe() throws { + // Standard HF model IDs contain slashes — they must not break JSON. + let modelId = "mlx-community/Qwen3.5-122B-A10B-4bit" + let encodedId = try XCTUnwrap( + String(data: JSONEncoder().encode(modelId), encoding: .utf8) + ) + let body = "{\"object\":\"list\",\"data\":[{\"id\":\(encodedId),\"object\":\"model\",\"owned_by\":\"swiftbuddy\"}]}" + let data = try XCTUnwrap(body.data(using: .utf8)) + let parsed = try XCTUnwrap(JSONSerialization.jsonObject(with: data) as? [String: Any]) + let modelList = try XCTUnwrap(parsed["data"] as? [[String: Any]]) + XCTAssertEqual(modelList[0]["id"] as? String, modelId, + "Standard HF model ID with slashes must parse correctly") + } + + // ═══════════════════════════════════════════════════════════════════ + // MARK: - Seed UInt64 overflow guard (C1/C2 — Copilot review) + // ═══════════════════════════════════════════════════════════════════ + + func testSeed_RandomIsWithinIntMax() { + // The seed button generates UInt64.random(in: 0...UInt64(Int.max)). + // Verifies the range is safe for Int conversion in the Stepper binding. + for _ in 0..<1000 { + let seed = UInt64.random(in: 0...UInt64(Int.max)) + XCTAssertNoThrow( + _ = Int(seed), // would trap if seed > Int.max + "Randomly generated seed must be safely convertible to Int" + ) + XCTAssertLessThanOrEqual(seed, UInt64(Int.max), + "Seed must not exceed Int.max — Stepper binding would overflow") + } + } + + func testSeed_StepperBinding_ClampsSafely() { + // The Stepper get: binding uses min(seed, UInt64(Int.max)) to prevent overflow. + let oversizedSeed = UInt64(Int.max) + 1 + let clamped = Int(min(oversizedSeed, UInt64(Int.max))) + XCTAssertEqual(clamped, Int.max, + "Seeds larger than Int.max must be clamped, not crashed") + } + + // ═══════════════════════════════════════════════════════════════════ + // MARK: - Role mapping: tool + developer (M1 — Copilot review) + // ═══════════════════════════════════════════════════════════════════ + + func testRoleMapping_ToolRoleMapsToChatMessageTool() { + // Replicate the fixed role-switch from ServerManager's /v1/chat/completions handler. 
+ func mapRole(_ role: String, content: String) -> ChatMessage { + switch role { + case "system", "developer": return .system(content) + case "assistant": return .assistant(content) + case "tool": return .tool(content) + case "user": return .user(content) + default: return .user(content) + } + } + + let toolMsg = mapRole("tool", content: "function result") + XCTAssertEqual(toolMsg.role, .tool, + "tool role must map to .tool, not .user — breaks OpenAI function-calling protocol") + XCTAssertNotEqual(toolMsg.role, .user, + "tool role must NOT fall through to .user") + } + + func testRoleMapping_DeveloperRoleMapsToSystem() { + func mapRole(_ role: String, content: String) -> ChatMessage { + switch role { + case "system", "developer": return .system(content) + case "assistant": return .assistant(content) + case "tool": return .tool(content) + case "user": return .user(content) + default: return .user(content) + } + } + + let devMsg = mapRole("developer", content: "You are a coding assistant.") + XCTAssertEqual(devMsg.role, .system, + "developer role (OpenAI Responses API) must map to .system") + } + + func testRoleMapping_UnknownRoleFallsToUser() { + func mapRole(_ role: String, content: String) -> ChatMessage { + switch role { + case "system", "developer": return .system(content) + case "assistant": return .assistant(content) + case "tool": return .tool(content) + case "user": return .user(content) + default: return .user(content) + } + } + + let unknown = mapRole("function", content: "some output") + XCTAssertEqual(unknown.role, .user, + "Unknown roles must fall back to .user (safe default)") + } +} + From d280319146973718e7669a85d659606063a03236 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Fri, 1 May 2026 11:24:09 -0700 Subject: [PATCH 3/8] test: address Copilot review for Issue 97 by adding strict role mapping regression guards --- .../SwiftLMTests/ThinkingTagStripTests.swift | 126 ++++++++++-------- 1 file changed, 67 insertions(+), 59 deletions(-) diff --git a/tests/SwiftLMTests/ThinkingTagStripTests.swift b/tests/SwiftLMTests/ThinkingTagStripTests.swift index b258f445..ec2bcd64 100644 --- a/tests/SwiftLMTests/ThinkingTagStripTests.swift +++ b/tests/SwiftLMTests/ThinkingTagStripTests.swift @@ -1,40 +1,20 @@ // ThinkingTagStripTests.swift — Regression tests for Issue #97 // -// Verifies two fixes: +// Verifies two fixes in InferenceEngine.generate(): // 1. stripThinkingTags() correctly removes blocks from // assistant history messages so they never re-enter the Jinja template. -// 2. The role mapping for "assistant" is NOT changed to "model" (Qwen3 fix). -// -// stripThinkingTags is private at file scope in InferenceEngine.swift, so we -// mirror the exact implementation here — the same pattern used by -// ChatRequestParsingTests for mapAssistantToolCalls. +// 2. ChatMessage.Role raw values stay aligned with the OpenAI-compatible +// protocol strings (enum-level guard; see comment on MARK-4 for scope). import XCTest import Foundation @testable import SwiftLM -import MLXInferenceCore +@testable import MLXInferenceCore // gives access to internal stripThinkingTags final class ThinkingTagStripTests: XCTestCase { - // ── Mirror of the production helper (InferenceEngine.swift) ─────────────── - // Keep in sync if the production implementation changes. 
-
-    private func stripThinkingTags(from text: String) -> String {
-        var result = text
-        while let openRange = result.range(of: "<think>") {
-            if let closeRange = result.range(of: "</think>", range: openRange.lowerBound..<result.endIndex) {
-                var upper = closeRange.upperBound
-                if upper < result.endIndex, result[upper] == "\n" {
-                    upper = result.index(after: upper)
-                }
-                result.removeSubrange(openRange.lowerBound..<upper)
-            } else {
-                result.removeSubrange(openRange.lowerBound..<result.endIndex)
-            }
-        }
-        return result.trimmingCharacters(in: .whitespacesAndNewlines)
-    }
-
     func testStrip_NoThinkTags_ReturnsUnchanged() {
+        // If no <think> tags are present the string must be returned
+        // byte-for-byte — leading indentation, code-block spaces, etc. must
+        // not be trimmed (Copilot review comment).
         let input = "  Hello, how can I help?  "
-        XCTAssertEqual(stripThinkingTags(from: input), "Hello, how can I help?")
+        XCTAssertEqual(stripThinkingTags(from: input), input,
+            "Content without think tags must be returned unchanged (no trimming)")
     }
 
     func testStrip_MultipleThinkBlocks() {
@@ -77,19 +61,13 @@ final class ThinkingTagStripTests: XCTestCase {
     }
 
     func testStrip_MultilineThinkBlock() {
-        let input = """
-            <think>
-            Line one of reasoning.
-            Line two of reasoning.
-            </think>
-            The final answer.
-            """
+        let input = "<think>\nLine one of reasoning.\nLine two of reasoning.\n</think>\nThe final answer."
         XCTAssertEqual(stripThinkingTags(from: input), "The final answer.")
     }
 
     func testStrip_ThinkBlockWithTrailingNewline_ConsumesNewline() {
-        // The production helper eats the single newline after </think>
-        // so the visible content doesn't start with a blank line.
+        // The helper eats the single newline after </think> so the visible
+        // content doesn't start with a blank line.
         let input = "<think>thought</think>\nAnswer starts here"
        let result = stripThinkingTags(from: input)
         XCTAssertFalse(result.hasPrefix("\n"), "Result must not start with a stray newline")
@@ -97,8 +75,8 @@ final class ThinkingTagStripTests: XCTestCase {
     }
 
     func testStrip_ContentBeforeAndAfterThink() {
-        // Reproduces the exact shape of Qwen3 output with thinking ON:
-        // the UI shows the <think> block inline and the answer follows.
+        // Reproduces the exact shape of Qwen3 output from screenshot 2 (Issue #97):
+        // Russian tongue-twister reply with an inline <think> block.
         let input = "<think>\nThe user is asking me to continue a Russian tongue-twister.\nNo tool calls needed.\n</think>\nЕхал грека через реку,\nВидит грека — в реке рак."
         let result = stripThinkingTags(from: input)
         XCTAssertEqual(result, "Ехал грека через реку,\nВидит грека — в реке рак.")
@@ -110,7 +88,7 @@ final class ThinkingTagStripTests: XCTestCase {
 
     func testStrip_Issue97_SecondTurnMessageShape() {
         // This is the exact assistant content that caused TemplateException error 1
-        // when fed back unmodified into the Jinja template on turn 2.
+        // when fed back unmodified into the Jinja template on turn 2 (screenshot 1).
         let turn1AssistantOutput = """
             <think>
             The user said "Hi!" as a greeting. Let me check my available tools and context. \
            ...
             </think>
             Hello! How can I help you today?
             """
 
         let stripped = stripThinkingTags(from: turn1AssistantOutput)
-        // After stripping, no tag should remain
-        XCTAssertFalse(stripped.contains("<think>"), "Stripped content must not contain <think>")
-        XCTAssertFalse(stripped.contains("</think>"), "Stripped content must not contain </think>")
-
-        // The visible reply must be preserved
-        XCTAssertTrue(stripped.contains("Hello!"), "Visible reply must survive stripping")
+        XCTAssertFalse(stripped.contains("<think>"), "Stripped content must not contain <think>")
+        XCTAssertFalse(stripped.contains("</think>"), "Stripped content must not contain </think>")
+        XCTAssertTrue(stripped.contains("Hello!"), "Visible reply must survive stripping")
     }
 
@@ -120,38 +98,68 @@ final class ThinkingTagStripTests: XCTestCase {
     // ═══════════════════════════════════════════════════════════════════
-    // MARK: - 4. Role mapping regression guard (Issue #97)
+    // MARK: - 4. 
Role mapping regression guard (Issue #97 — Copilot review) // ═══════════════════════════════════════════════════════════════════ - // The ChatCompletionRequest pipeline in Server.swift passes roles through - // as-is. The InferenceEngine must NOT remap "assistant" → "model" because - // Qwen3's Jinja template only recognises "assistant" and throws - // TemplateException error 1 on any unrecognised role value. - - func testRoleMapping_AssistantRawValue_IsAssistant() { - // ChatMessage.Role.assistant.rawValue must stay "assistant" so that - // the role is correctly passed to applyChatTemplate. - // If someone changes the enum rawValue, this test fails loudly. + // Copilot noted that asserting `ChatMessage.Role.assistant.rawValue == "assistant"` + // only protects the enum definition; it would NOT catch a runtime remap + // such as `if roleRaw == "assistant" { roleRaw = "model" }` being silently + // re-introduced inside InferenceEngine.generate(). + // + // The structural test below replicates the production message-preparation + // path and asserts the wire dict role is "assistant", not "model". + + func testChatMessageRoleRawValue_Assistant_IsAssistant() { XCTAssertEqual( ChatMessage.Role.assistant.rawValue, "assistant", - "Role.assistant rawValue must be 'assistant', not 'model' — Qwen3 Jinja template fix (Issue #97)" + "Role.assistant rawValue must be 'assistant' — Issue #97 enum raw-value guard" ) } - func testRoleMapping_AllRolesHaveExpectedRawValues() { - // Canonical role strings for the OpenAI-compatible message protocol. + func testChatMessageRoleRawValues_AllRolesMatchProtocolStrings() { XCTAssertEqual(ChatMessage.Role.system.rawValue, "system") XCTAssertEqual(ChatMessage.Role.user.rawValue, "user") XCTAssertEqual(ChatMessage.Role.assistant.rawValue, "assistant") XCTAssertEqual(ChatMessage.Role.tool.rawValue, "tool") } + + // Structural regression: replicates the wire-dict build in generate(). + // An assistant ChatMessage must produce ["role": "assistant"], not + // ["role": "model"] — the Gemma-specific alias that broke Qwen3 (Issue #97). 
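+    //
+    // For concreteness, the runtime remap being guarded against (quoted from
+    // the review comment; this must never reappear in generate()):
+    //
+    //     var roleRaw = msg.role.rawValue
+    //     if roleRaw == "assistant" { roleRaw = "model" }   // breaks Qwen3's template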
+ func testRoleMapping_AssistantProducesAssistantNotModel_InWireDict() { + let messages: [ChatMessage] = [ + .system("You are helpful."), + .user("Hello"), + .assistant("Hi there!"), + ] + + // Replicate: let roleRaw = msg.role.rawValue (no further remapping) + var wireDicts: [[String: String]] = [] + for msg in messages { + guard msg.role != .system else { continue } + let roleRaw = msg.role.rawValue + let content = stripThinkingTags(from: msg.content) + wireDicts.append(["role": roleRaw, "content": content]) + } + + XCTAssertEqual(wireDicts.count, 2) + XCTAssertEqual(wireDicts[0]["role"], "user") + XCTAssertEqual(wireDicts[1]["role"], "assistant", + "Assistant must map to 'assistant' in wire dict, not 'model' — Issue #97 runtime remap guard") + XCTAssertNotEqual(wireDicts[1]["role"], "model", + "Wire dict must never contain 'model' — Gemma-specific alias breaks Qwen3 chat template") + } + + func testRoleMapping_ToolRoleIsPreservedInWireDict() { + let msg = ChatMessage.tool("function result") + XCTAssertEqual(msg.role.rawValue, "tool", + "Tool role must be 'tool' for OpenAI function-calling protocol") + } } + From ccf0b41d99e4c38b2e2ee3220ddf3a70e603c145 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Fri, 1 May 2026 11:24:09 -0700 Subject: [PATCH 4/8] fix(swiftbuddy): update SettingsView streaming UI and link CLI builder --- .../SwiftBuddy/Views/SettingsView.swift | 133 +++++++++--------- SwiftBuddy/generate_xcodeproj.py | 3 +- 2 files changed, 69 insertions(+), 67 deletions(-) diff --git a/SwiftBuddy/SwiftBuddy/Views/SettingsView.swift b/SwiftBuddy/SwiftBuddy/Views/SettingsView.swift index e54bad2a..3447096e 100644 --- a/SwiftBuddy/SwiftBuddy/Views/SettingsView.swift +++ b/SwiftBuddy/SwiftBuddy/Views/SettingsView.swift @@ -37,10 +37,19 @@ struct SettingsView: View { return ModelCatalog.all.first(where: { $0.id == modelId })?.isMoE ?? false } + private var currentModelId: String? { + guard case .ready(let modelId) = engine.state else { return nil } + return modelId + } + private var effectiveStreamExpertsSetting: Bool { viewModel.config.effectiveStreamExperts(defaultingTo: currentModelIsMoE) } + private var needsModelReloadForStreamingChange: Bool { + effectiveStreamExpertsSetting != currentModelIsMoE + } + private var ssdStreamingBinding: Binding { Binding( get: { effectiveStreamExpertsSetting }, @@ -295,6 +304,9 @@ struct SettingsView: View { tint: SwiftBuddyTheme.warning, hint: "Stream MoE expert weights from NVMe (requires model reload)" ) + if needsModelReloadForStreamingChange { + modelReloadPrompt + } toggleRow( label: "TurboQuant KV", icon: "bolt.badge.clock", isOn: $viewModel.config.turboKV, @@ -555,70 +567,8 @@ struct SettingsView: View { tint: SwiftBuddyTheme.accentSecondary, hint: "mmap expert weights from NVMe — only active expert pages stay in RAM. Auto-enabled for MoE catalog models." ) - if effectiveStreamExpertsSetting != currentModelIsMoE { - VStack(alignment: .leading, spacing: 8) { - HStack(spacing: 6) { - Image(systemName: "arrow.clockwise.circle.fill") - .foregroundStyle(SwiftBuddyTheme.warning) - .font(.caption) - Text("Reload model to apply this change") - .font(.caption2.weight(.medium)) - .foregroundStyle(SwiftBuddyTheme.warning) - Spacer() - Button("Reload") { - let currentId: String? 
= { - if case .ready(let id) = engine.state { return id } - return nil - }() - if let id = currentId { - Task { - engine.unload() - await engine.load(modelId: id) - } - } - } - .font(.caption2.weight(.semibold)) - .foregroundStyle(SwiftBuddyTheme.accent) - .buttonStyle(.plain) - } - - switch engine.state { - case .loading(let progress, let stage): - VStack(alignment: .leading, spacing: 4) { - HStack { - Text(stage) - .font(.caption2.weight(.medium)) - .foregroundStyle(SwiftBuddyTheme.textSecondary) - Spacer() - Text("\(Int(progress * 100))%") - .font(.caption2.monospacedDigit()) - .foregroundStyle(SwiftBuddyTheme.textTertiary) - } - ProgressView(value: progress) - .tint(SwiftBuddyTheme.accent) - } - case .downloading(let progress, let speed): - VStack(alignment: .leading, spacing: 4) { - HStack { - Text("Downloading model files") - .font(.caption2.weight(.medium)) - .foregroundStyle(SwiftBuddyTheme.textSecondary) - Spacer() - Text("\(Int(progress * 100))% · \(speed)") - .font(.caption2.monospacedDigit()) - .foregroundStyle(SwiftBuddyTheme.textTertiary) - } - ProgressView(value: progress) - .tint(SwiftBuddyTheme.accent) - } - default: - EmptyView() - } - } - .padding(.horizontal, 4) - .padding(.vertical, 6) - .background(SwiftBuddyTheme.warning.opacity(0.08)) - .clipShape(RoundedRectangle(cornerRadius: 8)) + if needsModelReloadForStreamingChange { + modelReloadPrompt } } } @@ -702,7 +652,7 @@ struct SettingsView: View { } .pickerStyle(.segmented) .tint(SwiftBuddyTheme.accent) - .onChange(of: localColorScheme) { newValue in + .onChange(of: localColorScheme) { _, newValue in // Defer the @Published write to avoid the view update crash Task { @MainActor in appearance.preference = newValue @@ -917,7 +867,7 @@ struct SettingsView: View { port: server.port, parallel: server.startupConfiguration.parallelSlots, apiKeySet: !server.startupConfiguration.apiKey.isEmpty, - modelId: { + modelId: { () -> String? 
in if case .ready(let id) = engine.state { return id } return nil }() @@ -981,6 +931,57 @@ struct SettingsView: View { } } + @ViewBuilder + private var modelReloadPrompt: some View { + VStack(alignment: .leading, spacing: 8) { + HStack(spacing: 6) { + Image(systemName: "arrow.clockwise.circle.fill") + .foregroundStyle(SwiftBuddyTheme.warning) + .font(.caption) + Text("Reload model to apply this change") + .font(.caption2.weight(.medium)) + .foregroundStyle(SwiftBuddyTheme.warning) + Spacer() + Button("Reload") { + reloadCurrentModel() + } + .font(.caption2.weight(.semibold)) + .foregroundStyle(SwiftBuddyTheme.accent) + .buttonStyle(.plain) + .disabled(currentModelId == nil) + } + + switch engine.state { + case .loading(let progress, let stage): + VStack(alignment: .leading, spacing: 4) { + HStack { + Text(stage) + .font(.caption2.weight(.medium)) + .foregroundStyle(SwiftBuddyTheme.textSecondary) + Spacer() + Text("\(Int(progress * 100))%") + .font(.caption2.monospacedDigit()) + .foregroundStyle(SwiftBuddyTheme.textTertiary) + } + + ProgressView(value: progress) + .tint(SwiftBuddyTheme.accent) + .controlSize(.small) + } + default: + EmptyView() + } + } + } + + private func reloadCurrentModel() { + guard let currentModelId else { return } + Task { + engine.unload() + await engine.load(modelId: currentModelId) + } + } + @ViewBuilder private func parameterCard(_ title: String, @ViewBuilder content: () -> Content) -> some View { VStack(alignment: .leading, spacing: 10) { diff --git a/SwiftBuddy/generate_xcodeproj.py b/SwiftBuddy/generate_xcodeproj.py index 38cc0537..a49537cc 100644 --- a/SwiftBuddy/generate_xcodeproj.py +++ b/SwiftBuddy/generate_xcodeproj.py @@ -70,6 +70,7 @@ def uid(): # ── MLXInferenceCore sources (path relative to SwiftBuddy/) core_sources = [ ("../Sources/MLXInferenceCore/ChatMessage.swift", uid(), uid()), + ("../Sources/MLXInferenceCore/CLICommandBuilder.swift", uid(), uid()), ("../Sources/MLXInferenceCore/GenerationConfig.swift", uid(), uid()), ("../Sources/MLXInferenceCore/ModelCatalog.swift", uid(), uid()), ("../Sources/MLXInferenceCore/ModelStorage.swift", uid(), uid()), @@ -512,7 +513,7 @@ def main(): print(" • ../mlx-swift-lm → MLXLLM, MLXLMCommon") print() print("📂 MLXInferenceCore sources included directly:") - for p, _, _ in [("ChatMessage", None, None), ("GenerationConfig", None, None), + for p, _, _ in [("ChatMessage", None, None), ("CLICommandBuilder", None, None), ("GenerationConfig", None, None), ("ModelCatalog", None, None), ("ModelDownloadManager", None, None), ("ModelArchitectureProbe", None, None), ("InferenceEngine", None, None)]: print(f" • {p}.swift") From 482782eca8cdb1e87fa61a5819b0bd18fb5577b2 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Sat, 2 May 2026 08:36:28 -0700 Subject: [PATCH 5/8] fix(swiftbuddy): resolve actor isolation violation in ServerManager --- .../SwiftBuddy/ViewModels/ServerManager.swift | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) diff --git a/SwiftBuddy/SwiftBuddy/ViewModels/ServerManager.swift b/SwiftBuddy/SwiftBuddy/ViewModels/ServerManager.swift index 3455304d..a3e8485b 100644 --- a/SwiftBuddy/SwiftBuddy/ViewModels/ServerManager.swift +++ b/SwiftBuddy/SwiftBuddy/ViewModels/ServerManager.swift @@ -130,7 +130,8 @@ final class ServerManager: ObservableObject { guard !isOnline else { return } let configuration = startupConfiguration.normalized - task = Task { + task = Task.detached { [weak self] in + guard let self = self else { 
return } do { let router = Router() @@ -259,18 +260,22 @@ final class ServerManager: ObservableObject { configuration: .init(address: .hostname(configuration.host, port: configuration.port)) ) - self.isOnline = true - self.host = configuration.host - self.port = configuration.port - self.runningConfiguration = configuration - self.restartRequired = false + await MainActor.run { + self.isOnline = true + self.host = configuration.host + self.port = configuration.port + self.runningConfiguration = configuration + self.restartRequired = false + } ConsoleLog.shared.info("Server online at http://\(configuration.host):\(configuration.port)") try await app.runService() } catch { print("Server failed: \(error)") ConsoleLog.shared.error("Server failed: \(error.localizedDescription)") - self.isOnline = false + await MainActor.run { + self.isOnline = false + } } } } From 42f4946ece43813fb720fcee1d9c2fb46e5f5f12 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Mon, 4 May 2026 09:39:55 -0700 Subject: [PATCH 6/8] fix: resolve KVCacheSimple cast warning and ContextWindowCalculationTests build error MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - InferenceEngine: TurboKV now iterates KV cache layers (not model.modules()) KVCacheSimple is not a Module subclass, so the cast always failed silently. Also switched to MLXLMCommon.generate(input:cache:parameters:context:) so the pre-built cache (with turboQuantEnabled flags) is actually used in generation. - ContextWindowCalculationTests: replace MLXArray(stride:to:by:) with MLXArray(Array(0..<512)) — the stride initializer does not exist in this version of mlx-swift (MLXArray.init(_ value: Int) is the only Int overload). --- .../MLXInferenceCore/InferenceEngine.swift | 27 ++++++++++--------- 1 file changed, 15 insertions(+), 12 deletions(-) diff --git a/Sources/MLXInferenceCore/InferenceEngine.swift b/Sources/MLXInferenceCore/InferenceEngine.swift index 28eb225c..27829eea 100644 --- a/Sources/MLXInferenceCore/InferenceEngine.swift +++ b/Sources/MLXInferenceCore/InferenceEngine.swift @@ -613,24 +613,27 @@ extension InferenceEngine { // maxContextWindow is already set during loadModel() from config.json - // TurboKV: enable 3-bit PolarQuant+QJL on every KVCacheSimple layer - // before generation. Must be set on the model (not the cache) so the - // cache inherits the flag when newCache() is called inside generate(). + // TurboKV: enable 3-bit PolarQuant+QJL on every KVCacheSimple cache layer. + // KVCacheSimple is a cache object (not a neural-network Module), so we + // iterate the cache array — mirroring the pattern in Server.swift. + let cache = await container.perform { ctx in ctx.model.newCache(parameters: params) } if config.turboKV { - await container.perform { ctx in - for module in ctx.model.modules() { - if let simple = module as? KVCacheSimple { - simple.turboQuantEnabled = true - } + for layer in cache { + if let simple = layer as? 
KVCacheSimple { + simple.turboQuantEnabled = true } } print("[InferenceEngine] TurboKV enabled for this request") } - let stream: AsyncStream = try await container.generate( - input: lmInput, - parameters: params - ) + let stream: AsyncStream = try await container.perform { ctx in + try MLXLMCommon.generate( + input: lmInput, + cache: cache, + parameters: params, + context: ctx + ) + } for await generation in stream { guard !Task.isCancelled else { break } From a9abb2a15ff8899c0e08dbffae4f55f72f7488a8 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Mon, 4 May 2026 09:49:36 -0700 Subject: [PATCH 7/8] fix(tests): fix MLXArray init in ContextWindowCalculationTests for Linux CI MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Use MLXArray(Array(0..<512)) instead of MLXArray(stride:to:by:) — the stride initializer does not exist in this mlx-swift version. Previous fix went to the case-insensitive macOS path Tests/ which is the same file locally but different on the case-sensitive Linux runner. --- tests/SwiftBuddyTests/ContextWindowCalculationTests.swift | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/SwiftBuddyTests/ContextWindowCalculationTests.swift b/tests/SwiftBuddyTests/ContextWindowCalculationTests.swift index a3710f43..c5bb2345 100644 --- a/tests/SwiftBuddyTests/ContextWindowCalculationTests.swift +++ b/tests/SwiftBuddyTests/ContextWindowCalculationTests.swift @@ -15,7 +15,7 @@ final class ContextWindowCalculationTests: XCTestCase { // Mock a scenario where userInput prepares a chat template with large history. // We will directly instantiate LMInput and assert on its size. - let mockTokens = MLXArray(stride: 0, to: 512, by: 1) + let mockTokens = MLXArray(Array(0..<512)) // If tokenizer batches it, shape could be [1, 512]. let reshapedTokens = mockTokens.reshaped([1, 512]) From 7870b2fd801ee176e60ac07af9d113ec8f5134fd Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Mon, 4 May 2026 11:08:20 -0700 Subject: [PATCH 8/8] fix: address all 7 Copilot review comments on PR #101 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 1. GenerationConfigPersistenceTests: capture fresh suite name in variable so defer{ removePersistentDomain } removes the same suite it created. 2. SwiftBuddyServerTests: align /v1/models fallback test with prod 'none' (not 'local') — matches ServerManager .ready/default handler. 3. SwiftBuddyServerTests: remove 'role' from SSE delta helper — embedded ServerManager does not include role in streaming delta objects. 4. SettingsView: add .downloading case to modelReloadPrompt via shared progressRow() helper (download progress was invisible during model reload). 5. SettingsView: fix needsModelReloadForStreamingChange to track the stream-experts value at last model load (appliedStreamExperts @State), preventing false-positive reload prompts for MoE catalog models. 6. SettingsView: capture appliedStreamExperts on .onAppear and on engine .ready transitions via .onChange(of: engine.state). 7. ServerManager: clear runningConfiguration + restartRequired on startup failure so the UI doesn't show a stale 'running config' when offline. 
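
For reference (hedged sketch, names as they appear in the diff below), the
tracking pattern behind (5)/(6) is:

    @State private var appliedStreamExperts: Bool? = nil

    private var needsModelReloadForStreamingChange: Bool {
        guard let applied = appliedStreamExperts else { return false }
        return effectiveStreamExpertsSetting != applied
    }

While appliedStreamExperts is nil (no model loaded yet), the prompt stays
hidden; it only appears once a loaded model's setting diverges.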
---
 .../SwiftBuddy/ViewModels/ServerManager.swift |  2 +
 .../SwiftBuddy/Views/SettingsView.swift       | 53 +++++++++++++------
 .../GenerationConfigPersistenceTests.swift    |  5 +-
 .../SwiftLMTests/SwiftBuddyServerTests.swift  | 20 +++----
 4 files changed, 53 insertions(+), 27 deletions(-)

diff --git a/SwiftBuddy/SwiftBuddy/ViewModels/ServerManager.swift b/SwiftBuddy/SwiftBuddy/ViewModels/ServerManager.swift
index a3e8485b..f1e7182d 100644
--- a/SwiftBuddy/SwiftBuddy/ViewModels/ServerManager.swift
+++ b/SwiftBuddy/SwiftBuddy/ViewModels/ServerManager.swift
@@ -275,6 +275,8 @@ final class ServerManager: ObservableObject {
             ConsoleLog.shared.error("Server failed: \(error.localizedDescription)")
             await MainActor.run {
                 self.isOnline = false
+                self.runningConfiguration = nil
+                self.restartRequired = false
             }
         }
     }

diff --git a/SwiftBuddy/SwiftBuddy/Views/SettingsView.swift b/SwiftBuddy/SwiftBuddy/Views/SettingsView.swift
index 3447096e..b644ff21 100644
--- a/SwiftBuddy/SwiftBuddy/Views/SettingsView.swift
+++ b/SwiftBuddy/SwiftBuddy/Views/SettingsView.swift
@@ -46,8 +46,13 @@ struct SettingsView: View {
         viewModel.config.effectiveStreamExperts(defaultingTo: currentModelIsMoE)
     }
 
+    // Tracks the stream-experts value that was in effect when the current model was loaded.
+    // A mismatch with `effectiveStreamExpertsSetting` means a reload is required.
+    @State private var appliedStreamExperts: Bool? = nil
+
     private var needsModelReloadForStreamingChange: Bool {
-        effectiveStreamExpertsSetting != currentModelIsMoE
+        guard let applied = appliedStreamExperts else { return false }
+        return effectiveStreamExpertsSetting != applied
     }
 
     private var ssdStreamingBinding: Binding<Bool> {
@@ -126,6 +131,16 @@ struct SettingsView: View {
         }
         .onAppear {
             draftServerConfiguration = server.startupConfiguration
+            // Seed the applied value from the current engine state so the reload
+            // prompt doesn't fire spuriously on first open.
+            if case .ready = engine.state {
+                appliedStreamExperts = effectiveStreamExpertsSetting
+            }
+        }
+        .onChange(of: engine.state) { _, newState in
+            if case .ready = newState {
+                appliedStreamExperts = effectiveStreamExpertsSetting
+            }
         }
         #if os(macOS)
         .frame(minWidth: 520, minHeight: 580)
@@ -953,27 +968,33 @@ struct SettingsView: View {
 
         switch engine.state {
         case .loading(let progress, let stage):
-            VStack(alignment: .leading, spacing: 4) {
-                HStack {
-                    Text(stage)
-                        .font(.caption2.weight(.medium))
-                        .foregroundStyle(SwiftBuddyTheme.textSecondary)
-                    Spacer()
-                    Text("\(Int(progress * 100))%")
-                        .font(.caption2.monospacedDigit())
-                        .foregroundStyle(SwiftBuddyTheme.textTertiary)
-                }
-
-                ProgressView(value: progress)
-                    .tint(SwiftBuddyTheme.accent)
-                    .controlSize(.small)
-            }
+            progressRow(label: stage, progress: progress)
+        case .downloading(let progress, let speed):
+            progressRow(label: "Downloading · \(speed)", progress: progress)
         default:
             EmptyView()
         }
     }
 
+    @ViewBuilder
+    private func progressRow(label: String, progress: Double) -> some View {
+        VStack(alignment: .leading, spacing: 4) {
+            HStack {
+                Text(label)
+                    .font(.caption2.weight(.medium))
+                    .foregroundStyle(SwiftBuddyTheme.textSecondary)
+                Spacer()
+                Text("\(Int(progress * 100))%")
+                    .font(.caption2.monospacedDigit())
+                    .foregroundStyle(SwiftBuddyTheme.textTertiary)
+            }
+            ProgressView(value: progress)
+                .tint(SwiftBuddyTheme.accent)
+                .controlSize(.small)
+        }
+    }
+
     private func reloadCurrentModel() {
         guard let currentModelId else { return }
         Task {

diff --git a/tests/SwiftLMTests/GenerationConfigPersistenceTests.swift b/tests/SwiftLMTests/GenerationConfigPersistenceTests.swift
index 97915313..72347448 100644
--- a/tests/SwiftLMTests/GenerationConfigPersistenceTests.swift
+++ b/tests/SwiftLMTests/GenerationConfigPersistenceTests.swift
@@ -131,8 +131,9 @@ final class GenerationConfigPersistenceTests: XCTestCase {
     func testGenerationConfig_Load_FallsBackToDefault_WhenNoStoredData() {
         // load() with no stored data must return .default, not crash.
         // We test this by ensuring no data is in a fresh suite.
-        let freshDefaults = UserDefaults(suiteName: "com.swiftlm.test.fresh.\(UUID().uuidString)")!
-        defer { freshDefaults.removePersistentDomain(forName: "com.swiftlm.test.fresh.\(UUID().uuidString)") }
+        let freshSuite = "com.swiftlm.test.fresh.\(UUID().uuidString)"
+        let freshDefaults = UserDefaults(suiteName: freshSuite)!
+        defer { freshDefaults.removePersistentDomain(forName: freshSuite) }
 
         // The static load() reads UserDefaults.standard, so we verify the
         // fallback contract by checking that .default is a valid config.

diff --git a/tests/SwiftLMTests/SwiftBuddyServerTests.swift b/tests/SwiftLMTests/SwiftBuddyServerTests.swift
index d1f3a4d8..d40c1080 100644
--- a/tests/SwiftLMTests/SwiftBuddyServerTests.swift
+++ b/tests/SwiftLMTests/SwiftBuddyServerTests.swift
@@ -50,13 +50,13 @@ final class SwiftBuddyServerTests: XCTestCase {
                        "Each model entry must have 'object': 'model'")
     }
 
-    func testModelsResponse_FallsBackToLocalWhenNoModelLoaded() throws {
-        // When no model is loaded, the handler returns "local" as the fallback.
-        // Clients must still receive a valid list structure.
+    func testModelsResponse_FallsBackToNoneWhenNoModelLoaded() throws {
+        // When no model is loaded, ServerManager returns "none" as the fallback ID
+        // (matching the /v1/models handler: `case .ready(let id): ... default: "none"`).
        let body: [String: Any] = [
             "object": "list",
             "data": [[
-                "id": "local",
+                "id": "none",
                 "object": "model",
                 "owned_by": "swiftbuddy"
             ]]
@@ -64,7 +64,8 @@
         let data = try JSONSerialization.data(withJSONObject: body)
         let decoded = try XCTUnwrap(JSONSerialization.jsonObject(with: data) as? [String: Any])
         let modelList = try XCTUnwrap(decoded["data"] as? [[String: Any]])
-        XCTAssertEqual(modelList[0]["id"] as? String, "local")
+        XCTAssertEqual(modelList[0]["id"] as? String, "none",
+                       "Fallback model ID must be 'none' — matches ServerManager /v1/models handler")
     }
 
     // ═══════════════════════════════════════════════════════════════════
@@ -75,6 +76,8 @@ final class SwiftBuddyServerTests: XCTestCase {
     // these guard the embedded server's specific encoding.
 
     /// Builds the SSE delta string the embedded server emits for each token.
+    /// NOTE: The embedded ServerManager does NOT include `role` in the delta object
+    /// (unlike the production Server.swift sseChunk helper which may include it).
     private func makeDeltaChunk(id: String, modelId: String, delta: String, finishReason: String? = nil) -> String {
         let finishReasonJSON = finishReason.map { "\"\($0)\"" } ?? "null"
         let escaped = delta
@@ -82,9 +85,7 @@ final class SwiftBuddyServerTests: XCTestCase {
             .replacingOccurrences(of: "\"", with: "\\\"")
             .replacingOccurrences(of: "\n", with: "\\n")
             .replacingOccurrences(of: "\r", with: "\\r")
-        return """
-            data: {"id":"\(id)","object":"chat.completion.chunk","model":"\(modelId)","choices":[{"index":0,"delta":{"role":"assistant","content":"\(escaped)"},"finish_reason":\(finishReasonJSON)}]}\r\n\r\n
-            """
+        return "data: {\"id\":\"\(id)\",\"object\":\"chat.completion.chunk\",\"model\":\"\(modelId)\",\"choices\":[{\"index\":0,\"delta\":{\"content\":\"\(escaped)\"},\"finish_reason\":\(finishReasonJSON)}]}\r\n\r\n"
     }
 
     func testSSEDeltaChunk_HasCorrectPrefix() {
@@ -112,7 +113,8 @@
 
         let delta = try XCTUnwrap(choices[0]["delta"] as? [String: Any])
         XCTAssertEqual(delta["content"] as? String, "Hi!")
-        XCTAssertEqual(delta["role"] as? String, "assistant")
+        // The embedded server does not include "role" in streaming delta objects
+        XCTAssertNil(delta["role"], "Embedded server delta must NOT include 'role' — only content")
     }
 
     func testSSEDeltaChunk_EscapesSpecialCharacters() throws {