diff --git a/Package.swift b/Package.swift
index 42bccb6..9286564 100644
--- a/Package.swift
+++ b/Package.swift
@@ -117,7 +117,7 @@ let package = Package(
         ),
         .testTarget(
             name: "SwiftLMTests",
-            dependencies: ["SwiftLM"]
+            dependencies: ["SwiftLM", "MLXInferenceCore"]
         )
     ]
 )
diff --git a/Sources/MLXInferenceCore/CLICommandBuilder.swift b/Sources/MLXInferenceCore/CLICommandBuilder.swift
new file mode 100644
index 0000000..833aaf3
--- /dev/null
+++ b/Sources/MLXInferenceCore/CLICommandBuilder.swift
@@ -0,0 +1,68 @@
+// CLICommandBuilder.swift — Pure function for building the equivalent CLI command.
+// Lives in MLXInferenceCore so it can be unit-tested by SwiftLMTests without
+// requiring the SwiftBuddy app target.
+import Foundation
+
+/// Builds the equivalent `swift run SwiftLM` command string from persisted settings.
+/// The core connection flags are always emitted; sampler and engine flags appear
+/// only when they differ from the CLI defaults, keeping the command readable.
+///
+/// - Parameters:
+///   - config: The current `GenerationConfig`.
+///   - host: The server host string (e.g. "127.0.0.1").
+///   - port: The server port (e.g. 5413).
+///   - parallel: Number of parallel request slots (default 1).
+///   - apiKeySet: `true` if an API key is configured (the key itself is redacted).
+///   - modelId: The currently loaded model ID, or `nil` when no model is loaded.
+/// - Returns: A multi-line shell command string suitable for display and copy.
+public func buildCLICommand(
+    config: GenerationConfig,
+    host: String,
+    port: Int,
+    parallel: Int,
+    apiKeySet: Bool,
+    modelId: String?
+) -> String {
+    var parts: [String] = []
+
+    parts.append("--model \(modelId ?? "<model-id>")")
+    parts.append("--host \(host)")
+    parts.append("--port \(port)")
+    parts.append("--max-tokens \(config.maxTokens)")
+    parts.append("--temp \(String(format: "%.2f", config.temperature))")
+
+    if config.topP < 1.0 {
+        parts.append("--top-p \(String(format: "%.2f", config.topP))")
+    }
+    if config.topK != 50 {
+        parts.append("--top-k \(config.topK)")
+    }
+    if config.minP > 0 {
+        parts.append("--min-p \(String(format: "%.2f", config.minP))")
+    }
+    if config.repetitionPenalty != 1.05 {
+        parts.append("--repeat-penalty \(String(format: "%.2f", config.repetitionPenalty))")
+    }
+    if config.prefillSize != 512 {
+        parts.append("--prefill-size \(config.prefillSize)")
+    }
+    if let kvBits = config.kvBits {
+        parts.append("--kv-bits \(kvBits)")
+        if config.kvGroupSize != 64 {
+            parts.append("--kv-group-size \(config.kvGroupSize)")
+        }
+    }
+    if config.enableThinking {
+        parts.append("--thinking")
+    }
+    if let seed = config.seed {
+        parts.append("--seed \(seed)")
+    }
+    if parallel > 1 {
+        parts.append("--parallel \(parallel)")
+    }
+    if apiKeySet {
+        parts.append("--api-key <REDACTED>")
+    }
+
+    return "swift run SwiftLM " + parts.joined(separator: " \\\n ")
+}
diff --git a/Sources/MLXInferenceCore/GenerationConfig.swift b/Sources/MLXInferenceCore/GenerationConfig.swift
index 9ec4186..97c77a0 100644
--- a/Sources/MLXInferenceCore/GenerationConfig.swift
+++ b/Sources/MLXInferenceCore/GenerationConfig.swift
@@ -1,24 +1,29 @@
 // GenerationConfig.swift — SwiftLM inference parameters
 import Foundation
 
-/// Configuration for a single generation request.
-public struct GenerationConfig: Sendable {
+/// Per-request generation parameters, persisted across app launches via UserDefaults.
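+///
+/// Usage sketch (the first `load()` returns `.default` until a config is saved):
+/// ```swift
+/// var cfg = GenerationConfig.load()
+/// cfg.temperature = 0.8
+/// cfg.save()                      // JSON-encoded into UserDefaults
+/// ```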
+/// +/// ### Field classification +/// **Per-request** (applied on every `generate()` call — no reload needed): +/// temperature, topP, topK, minP, repetitionPenalty, seed, enableThinking, +/// prefillSize, kvBits, kvGroupSize, turboKV +/// +/// **Load-time** (requires model reload to take effect): +/// streamExperts — controls SSD expert streaming for MoE and large models. +/// Stored here for persistence but applied by InferenceEngine at load time. +public struct GenerationConfig: Sendable, Codable { public var maxTokens: Int public var temperature: Float public var topP: Float public var topK: Int public var minP: Float public var repetitionPenalty: Float - public var seed: UInt64? - public var enableThinking: Bool - // ── SwiftLM Engine Parameters ────────────────────────────────────── - /// Enable TurboQuant KV-cache compression (3-bit PolarQuant+QJL). - /// Compresses KV history > 8192 tokens to ~3.5 bits/token. - public var turboKV: Bool + /// Optional RNG seed for reproducible outputs. + /// When non-nil, `MLX.seed(seed)` is called before each generation using this `UInt64` value. + public var seed: UInt64? - /// Enable SSD expert streaming for MoE models. - public var streamExperts: Bool + public var enableThinking: Bool /// Chunk size for prefill evaluation. /// Lower values prevent GPU timeout on large models. @@ -30,6 +35,21 @@ public struct GenerationConfig: Sendable { /// KV-cache quantization group size (default 64). public var kvGroupSize: Int + /// Enable 3-bit TurboQuant KV-cache compression (PolarQuant+QJL). + /// Compresses KV history older than 8192 tokens to ~3.5 bits/token. + /// Recommended for 100k+ context to halve KV RAM usage. + /// Applied per-request — no model reload needed. + public var turboKV: Bool + + /// Enable SSD expert streaming for MoE (and any large) models. + /// When true, expert weights are mmap'd from NVMe and only active + /// expert pages reside in RAM during inference (Flash-MoE style). + /// ⚠️ LOAD-TIME flag: changes take effect on the next model load. + /// MoE models (isMoE == true) default to true automatically; + /// this flag lets users override that for non-catalog models or + /// force-disable streaming even on MoE models. + public var streamExperts: Bool + public init( maxTokens: Int = 2048, temperature: Float = 0.6, @@ -39,11 +59,11 @@ public struct GenerationConfig: Sendable { repetitionPenalty: Float = 1.05, seed: UInt64? = nil, enableThinking: Bool = false, - turboKV: Bool = false, - streamExperts: Bool = false, prefillSize: Int = 512, kvBits: Int? = nil, - kvGroupSize: Int = 64 + kvGroupSize: Int = 64, + turboKV: Bool = false, + streamExperts: Bool = false ) { self.maxTokens = maxTokens self.temperature = temperature @@ -53,12 +73,41 @@ public struct GenerationConfig: Sendable { self.repetitionPenalty = repetitionPenalty self.seed = seed self.enableThinking = enableThinking - self.turboKV = turboKV - self.streamExperts = streamExperts self.prefillSize = prefillSize self.kvBits = kvBits self.kvGroupSize = kvGroupSize + self.turboKV = turboKV + self.streamExperts = streamExperts } public static let `default` = GenerationConfig() + + // MARK: — Persistence + + private static let storageKey = "swiftlm.generationConfig" + + /// True when the user has previously saved a GenerationConfig. + /// Used to distinguish the first-run/default state from an explicit choice. + public static var hasPersistedConfig: Bool { + UserDefaults.standard.object(forKey: storageKey) != nil + } + + /// Computes the effective SSD streaming setting. 
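+    ///
+    /// A sketch of the intended precedence (hypothetical call sites):
+    /// ```swift
+    /// // Nothing persisted yet → the caller's default (e.g. isMoE) wins:
+    /// GenerationConfig.load().effectiveStreamExperts(defaultingTo: true)  // → true
+    /// // After the user saves streamExperts = false, the toggle is authoritative:
+    /// GenerationConfig.load().effectiveStreamExperts(defaultingTo: true)  // → false
+    /// ```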
+ /// Before the user has saved settings, MoE models default to streaming on. + /// After settings are persisted, the saved toggle becomes authoritative. + public func effectiveStreamExperts(defaultingTo defaultValue: Bool) -> Bool { + Self.hasPersistedConfig ? streamExperts : defaultValue + } + + public func save() { + guard let data = try? JSONEncoder().encode(self) else { return } + UserDefaults.standard.set(data, forKey: Self.storageKey) + } + + public static func load() -> GenerationConfig { + guard let data = UserDefaults.standard.data(forKey: storageKey), + let decoded = try? JSONDecoder().decode(GenerationConfig.self, from: data) + else { return .default } + return decoded + } } diff --git a/Sources/MLXInferenceCore/InferenceEngine.swift b/Sources/MLXInferenceCore/InferenceEngine.swift index 38d5b39..28eb225 100644 --- a/Sources/MLXInferenceCore/InferenceEngine.swift +++ b/Sources/MLXInferenceCore/InferenceEngine.swift @@ -72,7 +72,7 @@ private struct TransformersTokenizerBridge: MLXLMCommon.Tokenizer, Sendable { public enum ModelState: Equatable, Sendable { case idle case downloading(progress: Double, speed: String) - case loading + case loading(progress: Double, stage: String) case ready(modelId: String) case generating case error(String) @@ -319,7 +319,7 @@ public final class InferenceEngine: ObservableObject { } private func loadVerifiedModel(modelId: String) async { - state = .loading + setLoadingState(progress: 0.05, stage: "Preparing model configuration") currentModelId = modelId do { @@ -331,10 +331,13 @@ public final class InferenceEngine: ObservableObject { // at load time — only active expert pages touch RAM during inference. var config = ModelConfiguration(id: modelId) let isMoE = ModelCatalog.all.first(where: { $0.id == modelId })?.isMoE ?? false - if isMoE { + let generationConfig = GenerationConfig.load() + // SSD expert streaming defaults ON for MoE until the user saves a preference. + // Once persisted, the saved toggle becomes authoritative for all models. + let shouldStream = generationConfig.effectiveStreamExperts(defaultingTo: isMoE) + if shouldStream { config.lazyLoad = true let modelDir = ModelStorage.snapshotDirectory(for: modelId) - // directIO=true on macOS (5 GB/s NVMe pread), false on iOS (mmap fallback) ExpertStreamingConfig.shared.activate( modelDirectory: modelDir, useDirectIO: { @@ -345,15 +348,23 @@ public final class InferenceEngine: ObservableObject { #endif }() ) + print("[InferenceEngine] SSD expert streaming: enabled (isMoE=\(isMoE), persisted=\(GenerationConfig.hasPersistedConfig), setting=\(generationConfig.streamExperts))") + } else { + print("[InferenceEngine] SSD expert streaming: disabled") } + setLoadingState(progress: 0.15, stage: "Inspecting model architecture") let downloader = HubDownloader(hub: hub) let architecture = try await ModelArchitectureProbe.inspect( configuration: config, downloader: downloader ) - let speedTracker = DownloadSpeedTracker() + let loadingStage = architecture.supportsVision + ? 
"Loading multimodal model" + : "Loading language model" + + setLoadingState(progress: 0.22, stage: loadingStage) if architecture.supportsVision { container = try await VLMModelFactory.shared.loadContainer( @@ -361,22 +372,10 @@ public final class InferenceEngine: ObservableObject { using: TransformersTokenizerLoader(), configuration: config ) { [weak self] progress in - speedTracker.record(totalBytes: progress.completedUnitCount) - let smoothedSpeed = speedTracker.speedBytesPerSec - Task { @MainActor in guard let self else { return } let pct = progress.fractionCompleted - let speedStr = smoothedSpeed - .map { String(format: "%.1f MB/s", $0 / 1_000_000) } ?? "" - self.state = .downloading(progress: pct, speed: speedStr) - - self.downloadManager.updateProgress(ModelDownloadProgress( - modelId: modelId, - fractionCompleted: pct, - currentFile: "", - speedMBps: smoothedSpeed.map { $0 / 1_000_000 } - )) + self.setLoadingState(progress: 0.22 + (pct * 0.68), stage: loadingStage) } } } else { @@ -385,22 +384,10 @@ public final class InferenceEngine: ObservableObject { using: TransformersTokenizerLoader(), configuration: config ) { [weak self] progress in - speedTracker.record(totalBytes: progress.completedUnitCount) - let smoothedSpeed = speedTracker.speedBytesPerSec - Task { @MainActor in guard let self else { return } let pct = progress.fractionCompleted - let speedStr = smoothedSpeed - .map { String(format: "%.1f MB/s", $0 / 1_000_000) } ?? "" - self.state = .downloading(progress: pct, speed: speedStr) - - self.downloadManager.updateProgress(ModelDownloadProgress( - modelId: modelId, - fractionCompleted: pct, - currentFile: "", - speedMBps: smoothedSpeed.map { $0 / 1_000_000 } - )) + self.setLoadingState(progress: 0.22 + (pct * 0.68), stage: loadingStage) } } } @@ -410,11 +397,13 @@ public final class InferenceEngine: ObservableObject { downloadManager.refresh() // Verify integrity to catch incomplete downloads before marking as ready + setLoadingState(progress: 0.94, stage: "Verifying model files") guard ModelStorage.verifyModelIntegrity(for: modelId) else { throw NSError(domain: "InferenceEngine", code: 1, userInfo: [NSLocalizedDescriptionKey: "Model safetensors files are incomplete. Please delete and re-download."]) } // Read the model's actual max context length from config.json + setLoadingState(progress: 0.98, stage: "Reading model limits") if let ctxLen = ModelStorage.readMaxContextLength(for: modelId) { self.maxContextWindow = ctxLen print("[InferenceEngine] Model context window: \(ctxLen) tokens") @@ -464,6 +453,10 @@ public final class InferenceEngine: ObservableObject { MLX.Memory.cacheLimit = 0 } + private func setLoadingState(progress: Double, stage: String) { + state = .loading(progress: min(max(progress, 0), 1), stage: stage) + } + private func markModelCorrupted(modelId: String?, message: String) { let failedModelId = modelId ?? currentModelId releaseLoadedModelResources() @@ -488,6 +481,44 @@ public final class InferenceEngine: ObservableObject { } // MARK: — Generation +} + +// MARK: — Helpers + +/// Removes all `` spans from `text`, including the closing tag's +/// trailing newline when present. Used to sanitise assistant history messages +/// before they are re-submitted to the Jinja chat-template renderer on subsequent +/// turns — Qwen3 (and similar "thinking" models) raise TemplateException error 1 +/// when prior assistant turns contain raw thinking tags. 
+///
+/// Trimming is applied only when at least one tag span was actually removed so
+/// that assistant messages without thinking content are returned byte-for-byte
+/// (preserving leading spaces, code-block indentation, etc.).
+func stripThinkingTags(from text: String) -> String {
+    var result = text
+    var stripped = false
+    while let openRange = result.range(of: "<think>") {
+        stripped = true
+        if let closeRange = result.range(of: "</think>", range: openRange.lowerBound..<result.endIndex) {
+            // Consume the trailing newline after </think>, if present
+            var endIdx = closeRange.upperBound
+            if endIdx < result.endIndex && result[endIdx] == "\n" {
+                endIdx = result.index(after: endIdx)
+            }
+            result.removeSubrange(openRange.lowerBound..<endIdx)
+        } else {
+            // No closing </think> — strip from opening tag to end of string
+            result.removeSubrange(openRange.lowerBound...)
+            break
+        }
+    }
+    // Only trim surrounding whitespace that was introduced by stripping;
+    // leave untouched messages that contained no think tags.
+    return stripped ? result.trimmingCharacters(in: .whitespacesAndNewlines) : result
+}
+
+extension InferenceEngine {
+    // MARK: — Generation (continued)
 
     public nonisolated func generate(
         messages: [ChatMessage],
@@ -515,10 +546,21 @@
                 if msg.role == .system {
                     pendingSystemContext += msg.content + "\n\n"
                 } else {
-                    var roleRaw = msg.role.rawValue
-                    if roleRaw == "assistant" { roleRaw = "model" }
+                    // Use the canonical role name — Qwen3 (and most models) use
+                    // "assistant", not "model". The "model" alias is Gemma-specific
+                    // and breaks Qwen3's Jinja chat template on multi-turn history.
+                    let roleRaw = msg.role.rawValue  // "user" | "assistant" | "tool"
                     var content = msg.content
+                    // Strip <think>…</think> blocks from prior assistant turns.
+                    // If the model generated thinking content on a previous turn and
+                    // it was not already split into thinkingContent, the raw tags will
+                    // be present in `content`. Feeding them back into the Jinja template
+                    // on the next request causes TemplateException error 1 on Qwen3.
+                    if msg.role == .assistant {
+                        content = stripThinkingTags(from: content)
+                    }
+
                     if roleRaw == "user" && !pendingSystemContext.isEmpty {
                         content = "[SYSTEM CONTEXT / PERSONA DATA]\n" + pendingSystemContext + "\n[END CONTEXT]\n\n" + content
                         pendingSystemContext = ""  // Clear after injecting
@@ -545,17 +587,46 @@
         var outputText = ""
         var tokenCount = 0
 
-        let userInput = UserInput(messages: mlxMessages)
+        // Set RNG seed for reproducible output when requested.
+        if let seed = config.seed {
+            MLX.seed(seed)
+        }
+
+        // Pass enable_thinking to the Jinja chat template so the model
+        // actually generates <think> blocks when thinking mode is ON.
+        // Without this kwarg, Qwen3's template defaults to thinking=false
+        // regardless of what the UI toggle shows.
+        let additionalContext: [String: any Sendable]? = config.enableThinking
+            ? ["enable_thinking": true]
+            : ["enable_thinking": false]
+        let userInput = UserInput(
+            messages: mlxMessages,
+            additionalContext: additionalContext
+        )
         let lmInput = try await container.prepare(input: userInput)
 
-        // Approximate the input token size (as LMInput wrapper blocks direct inspection without private API)
-        // MLX often counts 1 word roughly as 1.3 tokens.
-        let stringLength = mlxMessages.map { ($0["content"] ?? "").count }.reduce(0, +)
-        let baseTokens = Int(Double(stringLength) / 3.5)
+        // Use the real token count from the prepared LMInput rather than
+        // a character-length heuristic (which was consistently off by 2–3×
+        // for CJK and code content).
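+        // (e.g. a four-character CJK string is often four or more tokens,
+        // where characters / 3.5 would have estimated just one)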
+        let baseTokens = lmInput.text.tokens.size
         self.activeContextTokens = baseTokens
         // maxContextWindow is already set during loadModel() from config.json
-
+
+        // TurboKV: enable 3-bit PolarQuant+QJL on every KVCacheSimple layer
+        // before generation. Must be set on the model (not the cache) so the
+        // cache inherits the flag when newCache() is called inside generate().
+        if config.turboKV {
+            await container.perform { ctx in
+                for module in ctx.model.modules() {
+                    if let simple = module as? KVCacheSimple {
+                        simple.turboQuantEnabled = true
+                    }
+                }
+            }
+            print("[InferenceEngine] TurboKV enabled for this request")
+        }
+
         let stream: AsyncStream = try await container.generate(
             input: lmInput,
             parameters: params
diff --git a/SwiftBuddy/SwiftBuddy/ViewModels/ChatViewModel.swift b/SwiftBuddy/SwiftBuddy/ViewModels/ChatViewModel.swift
index 5fcf1f6..60542c9 100644
--- a/SwiftBuddy/SwiftBuddy/ViewModels/ChatViewModel.swift
+++ b/SwiftBuddy/SwiftBuddy/ViewModels/ChatViewModel.swift
@@ -12,14 +12,43 @@ final class ChatViewModel: ObservableObject {
     @Published var streamingText: String = ""
     @Published var thinkingText: String? = nil
     @Published var isGenerating: Bool = false
-    @Published var config: GenerationConfig = .default
-    @Published var systemPrompt: String = ""
+    @Published var config: GenerationConfig = .load() {
+        didSet { scheduleConfigSave() }
+    }
+    @Published var systemPrompt: String = {
+        UserDefaults.standard.string(forKey: "swiftlm.systemPrompt") ?? ""
+    }() {
+        didSet { scheduleSystemPromptSave() }
+    }
     public var currentWing: String? = nil
     weak var engine: InferenceEngine?
     var modelContext: ModelContext?
     private var generationTask: Task<Void, Never>?
     private var activeSession: ChatSession?
+
+    // MARK: — Debounced persistence
+    // Saves are debounced at 0.5 s so rapid slider drags or keystrokes
+    // don't saturate UserDefaults with synchronous writes and cause UI jank.
+    private var configSaveWork: DispatchWorkItem?
+    private var systemPromptSaveWork: DispatchWorkItem?
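+    // NOTE: a DispatchWorkItem debounce means a save scheduled in the final
+    // 0.5 s before the app terminates can be lost — acceptable for these
+    // low-stakes preferences; call save() directly on any explicit quit path
+    // if that ever becomes an issue.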
+ + private func scheduleConfigSave() { + configSaveWork?.cancel() + let work = DispatchWorkItem { [weak self] in self?.config.save() } + configSaveWork = work + DispatchQueue.main.asyncAfter(deadline: .now() + 0.5, execute: work) + } + + private func scheduleSystemPromptSave() { + systemPromptSaveWork?.cancel() + let snapshot = systemPrompt + let work = DispatchWorkItem { + UserDefaults.standard.set(snapshot, forKey: "swiftlm.systemPrompt") + } + systemPromptSaveWork = work + DispatchQueue.main.asyncAfter(deadline: .now() + 0.5, execute: work) + } + // MARK: — Send func send(_ userText: String) async { diff --git a/SwiftBuddy/SwiftBuddy/ViewModels/ServerManager.swift b/SwiftBuddy/SwiftBuddy/ViewModels/ServerManager.swift index c76c917..3455304 100644 --- a/SwiftBuddy/SwiftBuddy/ViewModels/ServerManager.swift +++ b/SwiftBuddy/SwiftBuddy/ViewModels/ServerManager.swift @@ -150,10 +150,108 @@ final class ServerManager: ObservableObject { return Response(status: .ok, headers: swiftBuddyJSONHeaders, body: .init(byteBuffer: buffer)) } - // Simple V1 models mock + // ── /v1/models ───────────────────────────────────────── router.get("/v1/models") { _, _ -> Response in - let buffer = ByteBuffer(string: #"{"object": "list", "data": [{"id": "local", "object": "model"}]}"#) - return Response(status: .ok, headers: swiftBuddyJSONHeaders, body: .init(byteBuffer: buffer)) + let modelId: String + switch await engine.state { + case .ready(let id): modelId = id + default: modelId = "none" + } + // Use swiftBuddyJSONString to safely escape the model ID — + // model IDs with slashes (e.g. "mlx-community/Qwen3") are safe, + // but quotes or control chars would break the JSON structure. + let body = "{\"object\":\"list\",\"data\":[{\"id\":\(swiftBuddyJSONString(modelId)),\"object\":\"model\",\"owned_by\":\"swiftbuddy\"}]}" + return Response(status: .ok, headers: swiftBuddyJSONHeaders, + body: .init(byteBuffer: ByteBuffer(string: body))) + } + + // ── /v1/chat/completions (OpenAI-compatible, streaming + non-streaming) ── + router.post("/v1/chat/completions") { request, _ -> Response in + // 1. Parse body + guard let bodyData = try? await request.body.collect(upTo: 4 * 1024 * 1024), + let json = try? JSONSerialization.jsonObject(with: Data(buffer: bodyData)) as? [String: Any] + else { + let err = #"{"error":{"message":"Invalid JSON body","type":"invalid_request_error"}}"# + return Response(status: .badRequest, headers: swiftBuddyJSONHeaders, + body: .init(byteBuffer: ByteBuffer(string: err))) + } + + let streamRequested = json["stream"] as? Bool ?? false + + // 2. Map messages + var chatMessages: [ChatMessage] = [] + if let msgs = json["messages"] as? [[String: Any]] { + for m in msgs { + let role = m["role"] as? String ?? "user" + let content = m["content"] as? String ?? "" + switch role { + case "system", "developer": chatMessages.append(.system(content)) + case "assistant": chatMessages.append(.assistant(content)) + case "tool": chatMessages.append(.tool(content)) + case "user": chatMessages.append(.user(content)) + default: chatMessages.append(.user(content)) + } + } + } + + // 3. Build request config from persisted user defaults + per-request overrides + var reqConfig = GenerationConfig.load() + if let t = json["temperature"] as? Double { reqConfig.temperature = Float(t) } + if let p = json["top_p"] as? Double { reqConfig.topP = Float(p) } + if let mt = json["max_tokens"] as? Int { reqConfig.maxTokens = mt } + if let rp = json["frequency_penalty"] as? 
Double, rp > 0 { reqConfig.repetitionPenalty = Float(rp) }  // treated as a multiplicative repeat-penalty; 0 (the OpenAI default) is ignored
+
+            let modelId: String
+            switch await engine.state {
+            case .ready(let id): modelId = id
+            default: modelId = "local"
+            }
+            let reqId = "chatcmpl-\(UUID().uuidString.prefix(8))"
+            let created = Int(Date().timeIntervalSince1970)
+            // Escape model ID once — used in both streaming and non-streaming paths.
+            // Slashes in HF model IDs (e.g. "mlx-community/Qwen3") are safe inside
+            // JSON strings, but quotes/control chars in custom model names would break.
+            let escapedModelId = swiftBuddyJSONString(modelId)
+
+            // Helper: JSON-safe escape for token text using JSONEncoder so ALL
+            // control chars (U+0000–U+001F) are correctly escaped, not just \n/\r/\t.
+            func jsonEscape(_ s: String) -> String {
+                guard let data = try? JSONEncoder().encode(s),
+                      let raw = String(data: data, encoding: .utf8) else { return "\"\"" }
+                // JSONEncoder wraps in outer quotes — strip them for inline interpolation
+                return String(raw.dropFirst().dropLast())
+            }
+
+            if streamRequested {
+                // ── SSE streaming ───────────────────────────────────
+                var sseHeaders = HTTPFields()
+                sseHeaders.append(HTTPField(name: .contentType, value: "text/event-stream; charset=utf-8"))
+                sseHeaders.append(HTTPField(name: HTTPField.Name("Cache-Control")!, value: "no-cache"))
+                sseHeaders.append(HTTPField(name: HTTPField.Name("X-Accel-Buffering")!, value: "no"))
+
+                let sseStream = AsyncStream<ByteBuffer> { cont in
+                    Task {
+                        for await token in await engine.generate(messages: chatMessages, config: reqConfig) {
+                            let chunk = "{\"id\":\"\(reqId)\",\"object\":\"chat.completion.chunk\",\"created\":\(created),\"model\":\(escapedModelId),\"choices\":[{\"index\":0,\"delta\":{\"content\":\"\(jsonEscape(token.text))\"},\"finish_reason\":null}]}"
+                            cont.yield(ByteBuffer(string: "data: \(chunk)\n\n"))
+                        }
+                        cont.yield(ByteBuffer(string: "data: [DONE]\n\n"))
+                        cont.finish()
+                    }
+                }
+                return Response(status: .ok, headers: sseHeaders,
+                                body: .init(asyncSequence: sseStream))
+
+            } else {
+                // ── Non-streaming: collect full response ────────────
+                var fullText = ""
+                for await token in await engine.generate(messages: chatMessages, config: reqConfig) {
+                    fullText += token.text
+                }
+                let body = "{\"id\":\"\(reqId)\",\"object\":\"chat.completion\",\"created\":\(created),\"model\":\(escapedModelId),\"choices\":[{\"index\":0,\"message\":{\"role\":\"assistant\",\"content\":\"\(jsonEscape(fullText))\"},\"finish_reason\":\"stop\"}],\"usage\":{\"prompt_tokens\":0,\"completion_tokens\":0,\"total_tokens\":0}}"
+                return Response(status: .ok, headers: swiftBuddyJSONHeaders,
+                                body: .init(byteBuffer: ByteBuffer(string: body)))
+            }
+        }
 
         let app = Application(
diff --git a/SwiftBuddy/SwiftBuddy/Views/ChatView.swift b/SwiftBuddy/SwiftBuddy/Views/ChatView.swift
index 8bded67..831fe81 100644
--- a/SwiftBuddy/SwiftBuddy/Views/ChatView.swift
+++ b/SwiftBuddy/SwiftBuddy/Views/ChatView.swift
@@ -137,19 +137,28 @@
             case .downloading(let progress, let speed):
                 DownloadAnimationView(progress: progress, speed: speed)
 
-            case .loading:
+            case .loading(let progress, let stage):
                 VStack(spacing: 16) {
                     ZStack {
                         Circle()
                             .stroke(SwiftBuddyTheme.accent.opacity(0.15), lineWidth: 3)
                             .frame(width: 64, height: 64)
-                        ProgressView()
+                        ProgressView(value: progress)
                             .controlSize(.large)
                             .tint(SwiftBuddyTheme.accent)
+                            .frame(width: 64)
+                    }
+                    VStack(spacing: 4) {
+                        Text("Loading model into Metal GPU…")
+                            .font(.subheadline)
+                            .foregroundStyle(SwiftBuddyTheme.textSecondary)
+                        Text(stage)
+                            .font(.caption)
+
.foregroundStyle(SwiftBuddyTheme.textTertiary) + Text("\(Int(progress * 100))%") + .font(.caption.monospacedDigit()) + .foregroundStyle(SwiftBuddyTheme.textTertiary) } - Text("Loading model into Metal GPU…") - .font(.subheadline) - .foregroundStyle(SwiftBuddyTheme.textSecondary) } case .idle: @@ -252,13 +261,18 @@ struct ChatView: View { switch engine.state { case .idle: bannerRow(icon: "cpu", text: "No model loaded", color: SwiftBuddyTheme.textTertiary) - case .loading: - HStack(spacing: 8) { - ProgressView().controlSize(.mini).tint(SwiftBuddyTheme.accent) - Text("Loading model…") - .font(.caption) - .foregroundStyle(SwiftBuddyTheme.textSecondary) - Spacer() + case .loading(let progress, let stage): + VStack(alignment: .leading, spacing: 4) { + HStack { + Text(stage) + .font(.caption.weight(.medium)) + .foregroundStyle(SwiftBuddyTheme.textSecondary) + Spacer() + Text("\(Int(progress * 100))%") + .font(.caption2.monospacedDigit()) + .foregroundStyle(SwiftBuddyTheme.textTertiary) + } + ProgressView(value: progress).tint(SwiftBuddyTheme.accent) } .padding(.horizontal, 16) .padding(.vertical, 8) @@ -527,7 +541,7 @@ extension ModelState { var shortLabel: String { switch self { case .idle: return "No model" - case .loading: return "Loading…" + case .loading(let progress, _): return "\(Int(progress * 100))% loading" case .downloading(let p, _): return "\(Int(p * 100))% downloading" case .ready(let modelId): return modelId.components(separatedBy: "/").last ?? modelId case .generating: return "Generating" diff --git a/SwiftBuddy/SwiftBuddy/Views/ModelsView.swift b/SwiftBuddy/SwiftBuddy/Views/ModelsView.swift index ac0cfc1..ad96882 100644 --- a/SwiftBuddy/SwiftBuddy/Views/ModelsView.swift +++ b/SwiftBuddy/SwiftBuddy/Views/ModelsView.swift @@ -277,8 +277,8 @@ private struct ActiveModelCardView: View { entry: engine.loadedModelId.flatMap { id in ModelCatalog.all.first(where: { $0.id == id }) }, state: engine.state ) - case .loading: - loadingCard + case .loading(let progress, let stage): + loadingCard(progress: progress, stage: stage) case .downloading(let progress, let speed): downloadingCard(progress: progress, speed: speed) case .idle, .error: @@ -287,18 +287,24 @@ private struct ActiveModelCardView: View { } } - private var loadingCard: some View { - HStack(spacing: 12) { - ProgressView().controlSize(.regular).tint(SwiftBuddyTheme.accent) - VStack(alignment: .leading, spacing: 2) { - Text("Loading model…") - .font(.subheadline.weight(.semibold)) - .foregroundStyle(SwiftBuddyTheme.textPrimary) - Text("Initializing Metal GPU") - .font(.caption) + private func loadingCard(progress: Double, stage: String) -> some View { + VStack(alignment: .leading, spacing: 10) { + HStack { + ProgressView().controlSize(.regular).tint(SwiftBuddyTheme.accent) + VStack(alignment: .leading, spacing: 2) { + Text("Loading model…") + .font(.subheadline.weight(.semibold)) + .foregroundStyle(SwiftBuddyTheme.textPrimary) + Text(stage) + .font(.caption) + .foregroundStyle(SwiftBuddyTheme.textSecondary) + } + Spacer() + Text("\(Int(progress * 100))%") + .font(.caption.monospacedDigit()) .foregroundStyle(SwiftBuddyTheme.textSecondary) } - Spacer() + ProgressView(value: progress).tint(SwiftBuddyTheme.accent) } .padding() .glassCard(cornerRadius: SwiftBuddyTheme.radiusLarge) diff --git a/SwiftBuddy/SwiftBuddy/Views/RootView.swift b/SwiftBuddy/SwiftBuddy/Views/RootView.swift index 049e4b3..efa301a 100644 --- a/SwiftBuddy/SwiftBuddy/Views/RootView.swift +++ b/SwiftBuddy/SwiftBuddy/Views/RootView.swift @@ -26,6 +26,7 @@ struct 
RootView: View { @State private var showTextIngestion = false @State private var showModelManagement = false @State private var lastDownloadLogBucket: Int? + @State private var lastLoadingStage: String? enum Tab { case chat, models, palace, mindPalace, miner, settings } var body: some View { @@ -72,11 +73,16 @@ struct RootView: View { switch newState { case .idle: lastDownloadLogBucket = nil + lastLoadingStage = nil ConsoleLog.shared.info("Engine idle — no model loaded") - case .loading: + case .loading(_, let stage): lastDownloadLogBucket = nil - ConsoleLog.shared.info("Loading model…") + if lastLoadingStage != stage { + lastLoadingStage = stage + ConsoleLog.shared.info(stage) + } case .downloading(let p, let speed): + lastLoadingStage = nil let percent = Int(p * 100) let bucket = min((percent / 25) * 25, 100) if bucket != lastDownloadLogBucket, [0, 25, 50, 75, 100].contains(bucket) { @@ -85,12 +91,15 @@ struct RootView: View { } case .ready(let modelId): lastDownloadLogBucket = nil + lastLoadingStage = nil ConsoleLog.shared.info("✓ Model ready: \(modelId)") case .generating: lastDownloadLogBucket = nil + lastLoadingStage = nil ConsoleLog.shared.debug("Generating…") case .error(let msg): lastDownloadLogBucket = nil + lastLoadingStage = nil ConsoleLog.shared.error("Engine error: \(msg)") } } @@ -430,12 +439,12 @@ struct RootView: View { .tint(SwiftBuddyTheme.accent) .controlSize(.small) - case .loading: - HStack(spacing: 6) { - ProgressView().controlSize(.mini).tint(SwiftBuddyTheme.accent) - Text("Loading…") - .font(.caption) - .foregroundStyle(SwiftBuddyTheme.textSecondary) + case .loading(let progress, let stage): + VStack(alignment: .leading, spacing: 4) { + ProgressView(value: progress).tint(SwiftBuddyTheme.accent) + Text("\(Int(progress * 100))% · \(stage)") + .font(.caption2.monospacedDigit()) + .foregroundStyle(SwiftBuddyTheme.textTertiary) } case .downloading(let progress, let speed): diff --git a/SwiftBuddy/SwiftBuddy/Views/SettingsView.swift b/SwiftBuddy/SwiftBuddy/Views/SettingsView.swift index 9f6f5e7..e54bad2 100644 --- a/SwiftBuddy/SwiftBuddy/Views/SettingsView.swift +++ b/SwiftBuddy/SwiftBuddy/Views/SettingsView.swift @@ -18,6 +18,10 @@ struct SettingsView: View { @State private var selectedTab: SettingsTab = .generation @State private var draftServerConfiguration = ServerStartupConfiguration.load() @State private var showRestartNotification = false + @State private var endpointCopied = false + @State private var showAppliedBadge = false + @State private var toastHideWork: DispatchWorkItem? = nil + @State private var cliCopied = false @State private var serverSaveMessage = "Server settings saved" @State private var restartNotificationRequiresAction = false @@ -28,18 +32,36 @@ struct SettingsView: View { Double(ProcessInfo.processInfo.physicalMemory) / (1024 * 1024 * 1024) } + private var currentModelIsMoE: Bool { + guard case .ready(let modelId) = engine.state else { return false } + return ModelCatalog.all.first(where: { $0.id == modelId })?.isMoE ?? 
false + } + + private var effectiveStreamExpertsSetting: Bool { + viewModel.config.effectiveStreamExperts(defaultingTo: currentModelIsMoE) + } + + private var ssdStreamingBinding: Binding { + Binding( + get: { effectiveStreamExpertsSetting }, + set: { viewModel.config.streamExperts = $0 } + ) + } + enum SettingsTab: String, CaseIterable { case generation = "Generation" case engine = "Engine" + case appearance = "Appearance" case console = "Console" case about = "About" var icon: String { switch self { - case .generation: return "slider.horizontal.3" - case .engine: return "cpu" - case .console: return "terminal" - case .about: return "info.circle" + case .generation: return "slider.horizontal.3" + case .engine: return "cpu" + case .appearance: return "paintpalette" + case .console: return "terminal" + case .about: return "info.circle" } } } @@ -62,6 +84,8 @@ struct SettingsView: View { generationTab case .engine: engineTab + case .appearance: + appearanceTab case .console: consoleTab case .about: @@ -190,13 +214,16 @@ struct SettingsView: View { parameterCard("Output") { sliderRow( - label: "Max Tokens", icon: "text.word.spacing", + label: "Max Response Tokens", icon: "text.word.spacing", value: Binding( get: { Double(viewModel.config.maxTokens) }, set: { viewModel.config.maxTokens = Int($0) } ), - range: 128...max(16384.0, Double(engine.maxContextWindow)), step: 128, format: "%.0f", - tint: SwiftBuddyTheme.accent + range: 128...16384.0, step: 128, format: "%.0f", + tint: SwiftBuddyTheme.accent, + hint: engine.maxContextWindow > 0 + ? "Max output per reply. Model context window: \(engine.maxContextWindow / 1000)K tokens" + : "Max tokens generated per response (context window shown once model loads)" ) sliderRow( label: "Repetition Penalty", icon: "repeat.circle", @@ -208,6 +235,48 @@ struct SettingsView: View { tint: SwiftBuddyTheme.success, hint: "Higher = less repeating, 1.0 = disabled" ) + + // Seed — optional reproducibility + HStack { + Label("Seed", systemImage: "number") + .foregroundStyle(SwiftBuddyTheme.textPrimary) + .font(.callout) + Spacer() + if let seed = viewModel.config.seed { + Text("\(seed)") + .foregroundStyle(SwiftBuddyTheme.textSecondary) + .font(.callout.monospacedDigit()) + Stepper("", value: Binding( + get: { Int(min(seed, UInt64(Int.max))) }, + set: { viewModel.config.seed = UInt64($0) } + ), in: 0...Int.max) + .labelsHidden() + Button { + viewModel.config.seed = nil + } label: { + Image(systemName: "xmark.circle.fill") + .foregroundStyle(SwiftBuddyTheme.textTertiary) + } + .buttonStyle(.plain) + } else { + Text("Random") + .foregroundStyle(SwiftBuddyTheme.textTertiary) + .font(.callout) + Button { + viewModel.config.seed = UInt64.random(in: 0...UInt64(Int.max)) + } label: { + Image(systemName: "lock.fill") + .foregroundStyle(SwiftBuddyTheme.accent) + } + .buttonStyle(.plain) + } + } + .padding(.vertical, 2) + if viewModel.config.seed != nil { + Text("Fixed seed — same input will produce identical output") + .font(.caption2) + .foregroundStyle(SwiftBuddyTheme.textTertiary) + } } parameterCard("Reasoning") { @@ -219,6 +288,21 @@ struct SettingsView: View { ) } + parameterCard("Performance") { + toggleRow( + label: "SSD Streaming", icon: "internaldrive", + isOn: ssdStreamingBinding, + tint: SwiftBuddyTheme.warning, + hint: "Stream MoE expert weights from NVMe (requires model reload)" + ) + toggleRow( + label: "TurboQuant KV", icon: "bolt.badge.clock", + isOn: $viewModel.config.turboKV, + tint: SwiftBuddyTheme.success, + hint: "3-bit KV compression for massive context 
windows" + ) + } + parameterCard("System Prompt") { TextEditor(text: $viewModel.systemPrompt) .frame(minHeight: 80) @@ -235,8 +319,8 @@ struct SettingsView: View { // Reset button Button(role: .destructive) { - viewModel.config = .default - viewModel.systemPrompt = "" + viewModel.config = .default // didSet triggers config.save() + viewModel.systemPrompt = "" // didSet clears UserDefaults key } label: { HStack { Image(systemName: "arrow.counterclockwise") @@ -256,6 +340,39 @@ struct SettingsView: View { } .padding(.top, 8) } + // Generation params are hot-applied per request — no restart needed. + // Flash a brief badge so the user knows the change was captured. + .onChange(of: viewModel.config.temperature) { flashApplied() } + .onChange(of: viewModel.config.topP) { flashApplied() } + .onChange(of: viewModel.config.topK) { flashApplied() } + .onChange(of: viewModel.config.minP) { flashApplied() } + .onChange(of: viewModel.config.maxTokens) { flashApplied() } + .onChange(of: viewModel.config.repetitionPenalty) { flashApplied() } + .onChange(of: viewModel.config.enableThinking) { flashApplied() } + .onChange(of: viewModel.config.kvBits) { flashApplied() } + .onChange(of: viewModel.config.prefillSize) { flashApplied() } + .onChange(of: viewModel.config.seed) { flashApplied() } + .overlay(alignment: .top) { + if showAppliedBadge { + HStack(spacing: 6) { + Image(systemName: "checkmark.circle.fill") + .foregroundStyle(SwiftBuddyTheme.success) + .font(.caption) + Text("Applied — takes effect on next message") + .font(.caption.weight(.medium)) + .foregroundStyle(SwiftBuddyTheme.textPrimary) + } + .padding(.horizontal, 14) + .padding(.vertical, 8) + .background(.ultraThinMaterial) + .background(SwiftBuddyTheme.success.opacity(0.12)) + .clipShape(Capsule()) + .overlay(Capsule().strokeBorder(SwiftBuddyTheme.success.opacity(0.3), lineWidth: 1)) + .padding(.top, 8) + .transition(.move(edge: .top).combined(with: .opacity)) + .animation(.easeInOut(duration: 0.2), value: showAppliedBadge) + } + } } // MARK: — Engine Tab @@ -264,14 +381,57 @@ struct SettingsView: View { ScrollView { VStack(spacing: 16) { parameterCard("Local API Server") { - HStack { - Label(server.isOnline ? "Online" : "Offline", systemImage: "network") - .foregroundStyle(server.isOnline ? SwiftBuddyTheme.success : SwiftBuddyTheme.textSecondary) - .font(.callout.weight(.medium)) - Spacer() - Text("\(server.host):\(server.port)") - .foregroundStyle(SwiftBuddyTheme.textSecondary) - .font(.callout.monospacedDigit()) + // ── Endpoint URL card (tap to copy) ───────────────────── + let endpointURL = "http://\(server.host):\(server.port)" + Button { + copyEndpoint(endpointURL) + } label: { + HStack(spacing: 12) { + // Status dot + Circle() + .fill(server.isOnline ? SwiftBuddyTheme.success : SwiftBuddyTheme.textTertiary) + .frame(width: 8, height: 8) + .shadow(color: server.isOnline ? SwiftBuddyTheme.success.opacity(0.6) : .clear, + radius: 4) + + VStack(alignment: .leading, spacing: 2) { + Text(server.isOnline ? "Online" : "Offline") + .font(.caption2.weight(.semibold)) + .foregroundStyle(server.isOnline ? SwiftBuddyTheme.success : SwiftBuddyTheme.textTertiary) + Text(endpointURL) + .font(.system(.callout, design: .monospaced)) + .foregroundStyle(SwiftBuddyTheme.textPrimary) + } + + Spacer() + + // Copy / confirm icon + Image(systemName: endpointCopied ? "checkmark" : "doc.on.doc") + .font(.caption) + .foregroundStyle(endpointCopied ? 
SwiftBuddyTheme.success : SwiftBuddyTheme.textTertiary) + .animation(.easeInOut(duration: 0.2), value: endpointCopied) + } + .padding(12) + .frame(maxWidth: .infinity) + .background(SwiftBuddyTheme.background.opacity(0.6)) + .clipShape(RoundedRectangle(cornerRadius: 10)) + .overlay( + RoundedRectangle(cornerRadius: 10) + .strokeBorder( + server.isOnline + ? SwiftBuddyTheme.success.opacity(0.3) + : Color.white.opacity(0.07), + lineWidth: 1 + ) + ) + } + .buttonStyle(.plain) + + // Quick-use hint for external tools + if server.isOnline { + Text("Compatible with OpenAI SDK, LM Studio, Continue, Cursor") + .font(.caption2) + .foregroundStyle(SwiftBuddyTheme.textTertiary) } toggleRow( @@ -376,22 +536,92 @@ struct SettingsView: View { ) } - parameterCard("Appearance") { - HStack { - Label("Color Scheme", systemImage: "paintpalette") - .foregroundStyle(SwiftBuddyTheme.textPrimary) - .font(.callout) - Spacer() - } - Picker("", selection: $appearance.preference) { - HStack { Image(systemName: "moon.fill"); Text("Dark") }.tag("dark") - HStack { Image(systemName: "sun.max.fill"); Text("Light") }.tag("light") - HStack { Image(systemName: "circle.lefthalf.filled"); Text("System") }.tag("system") + parameterCard("Advanced Engine") { + // ── TurboKV (per-request, no reload needed) ────────────────────────── + toggleRow( + label: "TurboKV Compression", icon: "memorychip", + isOn: $viewModel.config.turboKV, + tint: SwiftBuddyTheme.warning, + hint: "3-bit PolarQuant+QJL compression for KV history >8K tokens. Halves long-context RAM — applied per request" + ) + + Divider().background(SwiftBuddyTheme.divider) + + // ── SSD Expert Streaming (load-time — shows reload prompt) ──── + VStack(alignment: .leading, spacing: 6) { + toggleRow( + label: "SSD Expert Streaming", icon: "externaldrive.fill", + isOn: ssdStreamingBinding, + tint: SwiftBuddyTheme.accentSecondary, + hint: "mmap expert weights from NVMe — only active expert pages stay in RAM. Auto-enabled for MoE catalog models." + ) + if effectiveStreamExpertsSetting != currentModelIsMoE { + VStack(alignment: .leading, spacing: 8) { + HStack(spacing: 6) { + Image(systemName: "arrow.clockwise.circle.fill") + .foregroundStyle(SwiftBuddyTheme.warning) + .font(.caption) + Text("Reload model to apply this change") + .font(.caption2.weight(.medium)) + .foregroundStyle(SwiftBuddyTheme.warning) + Spacer() + Button("Reload") { + let currentId: String? 
= { + if case .ready(let id) = engine.state { return id } + return nil + }() + if let id = currentId { + Task { + engine.unload() + await engine.load(modelId: id) + } + } + } + .font(.caption2.weight(.semibold)) + .foregroundStyle(SwiftBuddyTheme.accent) + .buttonStyle(.plain) + } + + switch engine.state { + case .loading(let progress, let stage): + VStack(alignment: .leading, spacing: 4) { + HStack { + Text(stage) + .font(.caption2.weight(.medium)) + .foregroundStyle(SwiftBuddyTheme.textSecondary) + Spacer() + Text("\(Int(progress * 100))%") + .font(.caption2.monospacedDigit()) + .foregroundStyle(SwiftBuddyTheme.textTertiary) + } + ProgressView(value: progress) + .tint(SwiftBuddyTheme.accent) + } + case .downloading(let progress, let speed): + VStack(alignment: .leading, spacing: 4) { + HStack { + Text("Downloading model files") + .font(.caption2.weight(.medium)) + .foregroundStyle(SwiftBuddyTheme.textSecondary) + Spacer() + Text("\(Int(progress * 100))% · \(speed)") + .font(.caption2.monospacedDigit()) + .foregroundStyle(SwiftBuddyTheme.textTertiary) + } + ProgressView(value: progress) + .tint(SwiftBuddyTheme.accent) + } + default: + EmptyView() + } + } + .padding(.horizontal, 4) + .padding(.vertical, 6) + .background(SwiftBuddyTheme.warning.opacity(0.08)) + .clipShape(RoundedRectangle(cornerRadius: 8)) + } } - .pickerStyle(.segmented) - .tint(SwiftBuddyTheme.accent) } - #if os(iOS) parameterCard("iOS Performance") { toggleRow( @@ -414,12 +644,80 @@ struct SettingsView: View { } #endif + // ── CLI Equivalent ────────────────────────────────────────── + parameterCard("CLI Equivalent") { + Text("Run standalone server with these settings:") + .font(.caption2) + .foregroundStyle(SwiftBuddyTheme.textTertiary) + + ScrollView(.horizontal, showsIndicators: false) { + Text(cliCommand) + .font(.system(size: 11, design: .monospaced)) + .foregroundStyle(SwiftBuddyTheme.textSecondary) + .textSelection(.enabled) + .padding(.vertical, 6) + } + + Button { + copyCLI() + } label: { + Label( + cliCopied ? "Copied!" : "Copy Command", + systemImage: cliCopied ? "checkmark" : "doc.on.doc" + ) + .font(.caption.weight(.medium)) + .frame(maxWidth: .infinity) + } + .buttonStyle(.bordered) + .tint(cliCopied ? 
SwiftBuddyTheme.success : SwiftBuddyTheme.accent)
+            .animation(.easeInOut(duration: 0.2), value: cliCopied)
+            }
 
                 Spacer(minLength: 20)
             }
             .padding(.top, 8)
         }
     }
 
+    // MARK: - Appearance Tab
+
+    // Use local state for the picker to avoid triggering a @Published write
+    // directly from within a view update cycle, which causes the crash:
+    // "Publishing changes from within view updates is not allowed"
+    @State private var localColorScheme: String = "dark"
+
+    private var appearanceTab: some View {
+        ScrollView {
+            VStack(spacing: 16) {
+                parameterCard("Theme") {
+                    HStack {
+                        Label("Color Scheme", systemImage: "paintpalette")
+                            .foregroundStyle(SwiftBuddyTheme.textPrimary)
+                            .font(.callout)
+                        Spacer()
+                    }
+                    Picker("", selection: $localColorScheme) {
+                        Text("Dark").tag("dark")
+                        Text("Light").tag("light")
+                        Text("System").tag("system")
+                    }
+                    .pickerStyle(.segmented)
+                    .tint(SwiftBuddyTheme.accent)
+                    .onChange(of: localColorScheme) { newValue in
+                        // Defer the @Published write to avoid the view update crash
+                        Task { @MainActor in
+                            appearance.preference = newValue
+                        }
+                    }
+                }
+            }
+            .padding(.horizontal, 16)
+            .padding(.bottom, 24)
+        }
+        .onAppear {
+            localColorScheme = appearance.preference
+        }
+    }
 
     // MARK: — Console Tab
 
     private var consoleTab: some View {
@@ -599,6 +897,59 @@
         .shadow(color: .black.opacity(0.18), radius: 14, y: 6)
     }
 
+    private func flashApplied() {
+        withAnimation { showAppliedBadge = true }
+        // Cancel any pending hide before scheduling a new one to prevent
+        // stacked closures from causing flicker when sliders are dragged rapidly.
+        toastHideWork?.cancel()
+        let work = DispatchWorkItem {
+            withAnimation { showAppliedBadge = false }
+        }
+        toastHideWork = work
+        DispatchQueue.main.asyncAfter(deadline: .now() + 2, execute: work)
+    }
+
+    /// Build the equivalent `swift run SwiftLM` command from current settings.
+    private var cliCommand: String {
+        buildCLICommand(
+            config: viewModel.config,
+            host: server.host,
+            port: server.port,
+            parallel: server.startupConfiguration.parallelSlots,
+            apiKeySet: !server.startupConfiguration.apiKey.isEmpty,
+            modelId: {
+                if case .ready(let id) = engine.state { return id }
+                return nil
+            }()
+        )
+    }
+
+    private func copyCLI() {
+        #if os(macOS)
+        NSPasteboard.general.clearContents()
+        NSPasteboard.general.setString(cliCommand, forType: .string)
+        #else
+        UIPasteboard.general.string = cliCommand
+        #endif
+        withAnimation { cliCopied = true }
+        DispatchQueue.main.asyncAfter(deadline: .now() + 2) {
+            withAnimation { cliCopied = false }
+        }
+    }
+
+    private func copyEndpoint(_ url: String) {
+        #if os(macOS)
+        NSPasteboard.general.clearContents()
+        NSPasteboard.general.setString(url, forType: .string)
+        #else
+        UIPasteboard.general.string = url
+        #endif
+        withAnimation { endpointCopied = true }
+        DispatchQueue.main.asyncAfter(deadline: .now() + 2) {
+            withAnimation { endpointCopied = false }
+        }
+    }
+
     private func saveServerConfiguration() {
         let changed = server.saveStartupConfiguration(draftServerConfiguration)
         draftServerConfiguration = server.startupConfiguration
diff --git a/tests/SwiftLMTests/ThinkingTagStripTests.swift b/tests/SwiftLMTests/ThinkingTagStripTests.swift
new file mode 100644
index 0000000..b258f44
--- /dev/null
+++ b/tests/SwiftLMTests/ThinkingTagStripTests.swift
@@ -0,0 +1,157 @@
+// ThinkingTagStripTests.swift — Regression tests for Issue #97
+//
+// Verifies two fixes:
+//   1. stripThinkingTags() correctly removes <think>…</think> blocks from
+//      assistant history messages so they never re-enter the Jinja template.
+//   2. The role mapping for "assistant" is NOT changed to "model" (Qwen3 fix).
+//
+// stripThinkingTags is private at file scope in InferenceEngine.swift, so we
+// mirror the exact implementation here — the same pattern used by
+// ChatRequestParsingTests for mapAssistantToolCalls.
+
+import XCTest
+import Foundation
+@testable import SwiftLM
+import MLXInferenceCore
+
+final class ThinkingTagStripTests: XCTestCase {
+
+    // ── Mirror of the production helper (InferenceEngine.swift) ──────────
+    // Keep in sync if the production implementation changes.
+
+    private func stripThinkingTags(from text: String) -> String {
+        var result = text
+        var stripped = false
+        while let openRange = result.range(of: "<think>") {
+            stripped = true
+            if let closeRange = result.range(of: "</think>", range: openRange.lowerBound..<result.endIndex) {
+                // Consume the trailing newline after </think>, if present
+                var endIdx = closeRange.upperBound
+                if endIdx < result.endIndex && result[endIdx] == "\n" {
+                    endIdx = result.index(after: endIdx)
+                }
+                result.removeSubrange(openRange.lowerBound..<endIdx)
+            } else {
+                // No closing </think> — strip from opening tag to end of string
+                result.removeSubrange(openRange.lowerBound...)
+                break
+            }
+        }
+        return stripped ? result.trimmingCharacters(in: .whitespacesAndNewlines) : result
+    }
+
+    // ═══════════════════════════════════════════════════════════════════
+    // MARK: - 1. Basic stripping
+    // ═══════════════════════════════════════════════════════════════════
+
+    func testStrip_MultipleThinkBlocks() {
+        // Handles multiple <think> sections in one reply
+        let input = "<think>first</think>\nVisible A\n<think>second</think>\nVisible B"
+        XCTAssertEqual(stripThinkingTags(from: input), "Visible A\nVisible B")
+    }
+
+    // ═══════════════════════════════════════════════════════════════════
+    // MARK: - 2. Edge cases
+    // ═══════════════════════════════════════════════════════════════════
+
+    func testStrip_UnclosedThinkTag_StripsToEndOfString() {
+        // If generation was interrupted mid-think, the closing tag may be absent.
+        let input = "Visible prefix\n<think>reasoning that never closed"
+        XCTAssertEqual(stripThinkingTags(from: input), "Visible prefix")
+    }
+
+    func testStrip_EmptyThinkBlock_RemovesTagsOnly() {
+        let input = "<think></think>The actual answer."
+        XCTAssertEqual(stripThinkingTags(from: input), "The actual answer.")
+    }
+
+    func testStrip_MultilineThinkBlock() {
+        let input = """
+        <think>
+        Line one of reasoning.
+        Line two of reasoning.
+        </think>
+        The final answer.
+        """
+        XCTAssertEqual(stripThinkingTags(from: input), "The final answer.")
+    }
+
+    func testStrip_ThinkBlockWithTrailingNewline_ConsumesNewline() {
+        // The production helper eats the single newline after </think>
+        // so the visible content doesn't start with a blank line.
+        let input = "<think>thought</think>\nAnswer starts here"
+        let result = stripThinkingTags(from: input)
+        XCTAssertFalse(result.hasPrefix("\n"), "Result must not start with a stray newline")
+        XCTAssertEqual(result, "Answer starts here")
+    }
+
+    func testStrip_ContentBeforeAndAfterThink() {
+        // Reproduces the exact shape of Qwen3 output with thinking ON:
+        // the UI shows the <think> block inline and the answer follows.
+        let input = "<think>\nThe user is asking me to continue a Russian tongue-twister.\nNo tool calls needed.\n</think>\nЕхал грека через реку,\nВидит грека — в реке рак."
+        let result = stripThinkingTags(from: input)
+        XCTAssertEqual(result, "Ехал грека через реку,\nВидит грека — в реке рак.")
+    }
+
+    // ═══════════════════════════════════════════════════════════════════
+    // MARK: - 3. Issue #97 crash reproducer
+    // ═══════════════════════════════════════════════════════════════════
+
+    func testStrip_Issue97_SecondTurnMessageShape() {
+        // This is the exact assistant content that caused TemplateException error 1
+        // when fed back unmodified into the Jinja template on turn 2.
+        let turn1AssistantOutput = """
+        <think>
+        The user said "Hi!" as a greeting. Let me check my available tools and context. \
+        No tool calls needed here — just a simple greeting.
+        </think>
+        Hello! 👋 It's great to meet you. How can I assist you today?
+        """
+        let stripped = stripThinkingTags(from: turn1AssistantOutput)
+
+        // After stripping, no tag should remain
+        XCTAssertFalse(stripped.contains("<think>"), "Stripped content must not contain <think>")
+        XCTAssertFalse(stripped.contains("</think>"), "Stripped content must not contain </think>")
+
+        // The visible reply must be preserved
+        XCTAssertTrue(stripped.contains("Hello!"), "Visible reply must survive stripping")
+    }
+
+    // ═══════════════════════════════════════════════════════════════════
+    // MARK: - 4. Role mapping regression guard (Issue #97)
+    // ═══════════════════════════════════════════════════════════════════
+    // The ChatCompletionRequest pipeline in Server.swift passes roles through
+    // as-is. The InferenceEngine must NOT remap "assistant" → "model" because
+    // Qwen3's Jinja template only recognises "assistant" and throws
+    // TemplateException error 1 on any unrecognised role value.
+
+    func testRoleMapping_AssistantRawValue_IsAssistant() {
+        // ChatMessage.Role.assistant.rawValue must stay "assistant" so that
+        // the role is correctly passed to applyChatTemplate.
+        // If someone changes the enum rawValue, this test fails loudly.
+        XCTAssertEqual(
+            ChatMessage.Role.assistant.rawValue,
+            "assistant",
+            "Role.assistant rawValue must be 'assistant', not 'model' — Qwen3 Jinja template fix (Issue #97)"
+        )
+    }
+
+    func testRoleMapping_AllRolesHaveExpectedRawValues() {
+        // Canonical role strings for the OpenAI-compatible message protocol.
+        XCTAssertEqual(ChatMessage.Role.system.rawValue, "system")
+        XCTAssertEqual(ChatMessage.Role.user.rawValue, "user")
+        XCTAssertEqual(ChatMessage.Role.assistant.rawValue, "assistant")
+        XCTAssertEqual(ChatMessage.Role.tool.rawValue, "tool")
+    }
+}