diff --git a/Package.swift b/Package.swift
index 42bccb6..9286564 100644
--- a/Package.swift
+++ b/Package.swift
@@ -117,7 +117,7 @@ let package = Package(
         ),
         .testTarget(
             name: "SwiftLMTests",
-            dependencies: ["SwiftLM"]
+            dependencies: ["SwiftLM", "MLXInferenceCore"]
         )
     ]
 )
diff --git a/Sources/MLXInferenceCore/CLICommandBuilder.swift b/Sources/MLXInferenceCore/CLICommandBuilder.swift
new file mode 100644
index 0000000..833aaf3
--- /dev/null
+++ b/Sources/MLXInferenceCore/CLICommandBuilder.swift
@@ -0,0 +1,68 @@
+// CLICommandBuilder.swift — Pure function for building the equivalent CLI command.
+// Lives in MLXInferenceCore so it can be unit-tested by SwiftLMTests without
+// requiring the SwiftBuddy app target.
+import Foundation
+
+/// Builds the equivalent `swift run SwiftLM` command string from persisted settings.
+/// The core connection flags are always emitted; sampler and engine flags appear
+/// only when they differ from the CLI defaults, keeping the command readable.
+///
+/// - Parameters:
+///   - config: The current `GenerationConfig`.
+///   - host: The server host string (e.g. "127.0.0.1").
+///   - port: The server port (e.g. 5413).
+///   - parallel: Number of parallel request slots (default 1).
+///   - apiKeySet: `true` if an API key is configured (the key itself is redacted).
+///   - modelId: The currently loaded model ID, or `nil` when no model is loaded.
+/// - Returns: A multi-line shell command string suitable for display and copy.
+public func buildCLICommand(
+    config: GenerationConfig,
+    host: String,
+    port: Int,
+    parallel: Int,
+    apiKeySet: Bool,
+    modelId: String?
+) -> String {
+    var parts: [String] = []
+
+    parts.append("--model \(modelId ?? "<model-id>")")
+    parts.append("--host \(host)")
+    parts.append("--port \(port)")
+    parts.append("--max-tokens \(config.maxTokens)")
+    parts.append("--temp \(String(format: "%.2f", config.temperature))")
+
+    if config.topP < 1.0 {
+        parts.append("--top-p \(String(format: "%.2f", config.topP))")
+    }
+    if config.topK != 50 {
+        parts.append("--top-k \(config.topK)")
+    }
+    if config.minP > 0 {
+        parts.append("--min-p \(String(format: "%.2f", config.minP))")
+    }
+    if config.repetitionPenalty != 1.05 {
+        parts.append("--repeat-penalty \(String(format: "%.2f", config.repetitionPenalty))")
+    }
+    if config.prefillSize != 512 {
+        parts.append("--prefill-size \(config.prefillSize)")
+    }
+    if let kvBits = config.kvBits {
+        parts.append("--kv-bits \(kvBits)")
+        if config.kvGroupSize != 64 {
+            parts.append("--kv-group-size \(config.kvGroupSize)")
+        }
+    }
+    if config.enableThinking {
+        parts.append("--thinking")
+    }
+    if let seed = config.seed {
+        parts.append("--seed \(seed)")
+    }
+    if parallel > 1 {
+        parts.append("--parallel \(parallel)")
+    }
+    if apiKeySet {
+        parts.append("--api-key <REDACTED>")
+    }
+
+    return "swift run SwiftLM " + parts.joined(separator: " \\\n ")
+}
diff --git a/Sources/MLXInferenceCore/GenerationConfig.swift b/Sources/MLXInferenceCore/GenerationConfig.swift
index 9ec4186..97c77a0 100644
--- a/Sources/MLXInferenceCore/GenerationConfig.swift
+++ b/Sources/MLXInferenceCore/GenerationConfig.swift
@@ -1,24 +1,29 @@
 // GenerationConfig.swift — SwiftLM inference parameters
 import Foundation
 
-/// Configuration for a single generation request.
-public struct GenerationConfig: Sendable {
+/// Per-request generation parameters, persisted across app launches via UserDefaults.
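+///
+/// Usage sketch (the first `load()` returns `.default` until a config is saved):
+/// ```swift
+/// var cfg = GenerationConfig.load()
+/// cfg.temperature = 0.8
+/// cfg.save()                      // JSON-encoded into UserDefaults
+/// ```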
+/// +/// ### Field classification +/// **Per-request** (applied on every `generate()` call — no reload needed): +/// temperature, topP, topK, minP, repetitionPenalty, seed, enableThinking, +/// prefillSize, kvBits, kvGroupSize, turboKV +/// +/// **Load-time** (requires model reload to take effect): +/// streamExperts — controls SSD expert streaming for MoE and large models. +/// Stored here for persistence but applied by InferenceEngine at load time. +public struct GenerationConfig: Sendable, Codable { public var maxTokens: Int public var temperature: Float public var topP: Float public var topK: Int public var minP: Float public var repetitionPenalty: Float - public var seed: UInt64? - public var enableThinking: Bool - // ── SwiftLM Engine Parameters ────────────────────────────────────── - /// Enable TurboQuant KV-cache compression (3-bit PolarQuant+QJL). - /// Compresses KV history > 8192 tokens to ~3.5 bits/token. - public var turboKV: Bool + /// Optional RNG seed for reproducible outputs. + /// When non-nil, `MLX.seed(seed)` is called before each generation using this `UInt64` value. + public var seed: UInt64? - /// Enable SSD expert streaming for MoE models. - public var streamExperts: Bool + public var enableThinking: Bool /// Chunk size for prefill evaluation. /// Lower values prevent GPU timeout on large models. @@ -30,6 +35,21 @@ public struct GenerationConfig: Sendable { /// KV-cache quantization group size (default 64). public var kvGroupSize: Int + /// Enable 3-bit TurboQuant KV-cache compression (PolarQuant+QJL). + /// Compresses KV history older than 8192 tokens to ~3.5 bits/token. + /// Recommended for 100k+ context to halve KV RAM usage. + /// Applied per-request — no model reload needed. + public var turboKV: Bool + + /// Enable SSD expert streaming for MoE (and any large) models. + /// When true, expert weights are mmap'd from NVMe and only active + /// expert pages reside in RAM during inference (Flash-MoE style). + /// ⚠️ LOAD-TIME flag: changes take effect on the next model load. + /// MoE models (isMoE == true) default to true automatically; + /// this flag lets users override that for non-catalog models or + /// force-disable streaming even on MoE models. + public var streamExperts: Bool + public init( maxTokens: Int = 2048, temperature: Float = 0.6, @@ -39,11 +59,11 @@ public struct GenerationConfig: Sendable { repetitionPenalty: Float = 1.05, seed: UInt64? = nil, enableThinking: Bool = false, - turboKV: Bool = false, - streamExperts: Bool = false, prefillSize: Int = 512, kvBits: Int? = nil, - kvGroupSize: Int = 64 + kvGroupSize: Int = 64, + turboKV: Bool = false, + streamExperts: Bool = false ) { self.maxTokens = maxTokens self.temperature = temperature @@ -53,12 +73,41 @@ public struct GenerationConfig: Sendable { self.repetitionPenalty = repetitionPenalty self.seed = seed self.enableThinking = enableThinking - self.turboKV = turboKV - self.streamExperts = streamExperts self.prefillSize = prefillSize self.kvBits = kvBits self.kvGroupSize = kvGroupSize + self.turboKV = turboKV + self.streamExperts = streamExperts } public static let `default` = GenerationConfig() + + // MARK: — Persistence + + private static let storageKey = "swiftlm.generationConfig" + + /// True when the user has previously saved a GenerationConfig. + /// Used to distinguish the first-run/default state from an explicit choice. + public static var hasPersistedConfig: Bool { + UserDefaults.standard.object(forKey: storageKey) != nil + } + + /// Computes the effective SSD streaming setting. 
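+    ///
+    /// A sketch of the intended precedence (hypothetical call sites):
+    /// ```swift
+    /// // Nothing persisted yet → the caller's default (e.g. isMoE) wins:
+    /// GenerationConfig.load().effectiveStreamExperts(defaultingTo: true)  // → true
+    /// // After the user saves streamExperts = false, the toggle is authoritative:
+    /// GenerationConfig.load().effectiveStreamExperts(defaultingTo: true)  // → false
+    /// ```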
+ /// Before the user has saved settings, MoE models default to streaming on. + /// After settings are persisted, the saved toggle becomes authoritative. + public func effectiveStreamExperts(defaultingTo defaultValue: Bool) -> Bool { + Self.hasPersistedConfig ? streamExperts : defaultValue + } + + public func save() { + guard let data = try? JSONEncoder().encode(self) else { return } + UserDefaults.standard.set(data, forKey: Self.storageKey) + } + + public static func load() -> GenerationConfig { + guard let data = UserDefaults.standard.data(forKey: storageKey), + let decoded = try? JSONDecoder().decode(GenerationConfig.self, from: data) + else { return .default } + return decoded + } } diff --git a/Sources/MLXInferenceCore/InferenceEngine.swift b/Sources/MLXInferenceCore/InferenceEngine.swift index 38d5b39..28eb225 100644 --- a/Sources/MLXInferenceCore/InferenceEngine.swift +++ b/Sources/MLXInferenceCore/InferenceEngine.swift @@ -72,7 +72,7 @@ private struct TransformersTokenizerBridge: MLXLMCommon.Tokenizer, Sendable { public enum ModelState: Equatable, Sendable { case idle case downloading(progress: Double, speed: String) - case loading + case loading(progress: Double, stage: String) case ready(modelId: String) case generating case error(String) @@ -319,7 +319,7 @@ public final class InferenceEngine: ObservableObject { } private func loadVerifiedModel(modelId: String) async { - state = .loading + setLoadingState(progress: 0.05, stage: "Preparing model configuration") currentModelId = modelId do { @@ -331,10 +331,13 @@ public final class InferenceEngine: ObservableObject { // at load time — only active expert pages touch RAM during inference. var config = ModelConfiguration(id: modelId) let isMoE = ModelCatalog.all.first(where: { $0.id == modelId })?.isMoE ?? false - if isMoE { + let generationConfig = GenerationConfig.load() + // SSD expert streaming defaults ON for MoE until the user saves a preference. + // Once persisted, the saved toggle becomes authoritative for all models. + let shouldStream = generationConfig.effectiveStreamExperts(defaultingTo: isMoE) + if shouldStream { config.lazyLoad = true let modelDir = ModelStorage.snapshotDirectory(for: modelId) - // directIO=true on macOS (5 GB/s NVMe pread), false on iOS (mmap fallback) ExpertStreamingConfig.shared.activate( modelDirectory: modelDir, useDirectIO: { @@ -345,15 +348,23 @@ public final class InferenceEngine: ObservableObject { #endif }() ) + print("[InferenceEngine] SSD expert streaming: enabled (isMoE=\(isMoE), persisted=\(GenerationConfig.hasPersistedConfig), setting=\(generationConfig.streamExperts))") + } else { + print("[InferenceEngine] SSD expert streaming: disabled") } + setLoadingState(progress: 0.15, stage: "Inspecting model architecture") let downloader = HubDownloader(hub: hub) let architecture = try await ModelArchitectureProbe.inspect( configuration: config, downloader: downloader ) - let speedTracker = DownloadSpeedTracker() + let loadingStage = architecture.supportsVision + ? 
"Loading multimodal model" + : "Loading language model" + + setLoadingState(progress: 0.22, stage: loadingStage) if architecture.supportsVision { container = try await VLMModelFactory.shared.loadContainer( @@ -361,22 +372,10 @@ public final class InferenceEngine: ObservableObject { using: TransformersTokenizerLoader(), configuration: config ) { [weak self] progress in - speedTracker.record(totalBytes: progress.completedUnitCount) - let smoothedSpeed = speedTracker.speedBytesPerSec - Task { @MainActor in guard let self else { return } let pct = progress.fractionCompleted - let speedStr = smoothedSpeed - .map { String(format: "%.1f MB/s", $0 / 1_000_000) } ?? "" - self.state = .downloading(progress: pct, speed: speedStr) - - self.downloadManager.updateProgress(ModelDownloadProgress( - modelId: modelId, - fractionCompleted: pct, - currentFile: "", - speedMBps: smoothedSpeed.map { $0 / 1_000_000 } - )) + self.setLoadingState(progress: 0.22 + (pct * 0.68), stage: loadingStage) } } } else { @@ -385,22 +384,10 @@ public final class InferenceEngine: ObservableObject { using: TransformersTokenizerLoader(), configuration: config ) { [weak self] progress in - speedTracker.record(totalBytes: progress.completedUnitCount) - let smoothedSpeed = speedTracker.speedBytesPerSec - Task { @MainActor in guard let self else { return } let pct = progress.fractionCompleted - let speedStr = smoothedSpeed - .map { String(format: "%.1f MB/s", $0 / 1_000_000) } ?? "" - self.state = .downloading(progress: pct, speed: speedStr) - - self.downloadManager.updateProgress(ModelDownloadProgress( - modelId: modelId, - fractionCompleted: pct, - currentFile: "", - speedMBps: smoothedSpeed.map { $0 / 1_000_000 } - )) + self.setLoadingState(progress: 0.22 + (pct * 0.68), stage: loadingStage) } } } @@ -410,11 +397,13 @@ public final class InferenceEngine: ObservableObject { downloadManager.refresh() // Verify integrity to catch incomplete downloads before marking as ready + setLoadingState(progress: 0.94, stage: "Verifying model files") guard ModelStorage.verifyModelIntegrity(for: modelId) else { throw NSError(domain: "InferenceEngine", code: 1, userInfo: [NSLocalizedDescriptionKey: "Model safetensors files are incomplete. Please delete and re-download."]) } // Read the model's actual max context length from config.json + setLoadingState(progress: 0.98, stage: "Reading model limits") if let ctxLen = ModelStorage.readMaxContextLength(for: modelId) { self.maxContextWindow = ctxLen print("[InferenceEngine] Model context window: \(ctxLen) tokens") @@ -464,6 +453,10 @@ public final class InferenceEngine: ObservableObject { MLX.Memory.cacheLimit = 0 } + private func setLoadingState(progress: Double, stage: String) { + state = .loading(progress: min(max(progress, 0), 1), stage: stage) + } + private func markModelCorrupted(modelId: String?, message: String) { let failedModelId = modelId ?? currentModelId releaseLoadedModelResources() @@ -488,6 +481,44 @@ public final class InferenceEngine: ObservableObject { } // MARK: — Generation +} + +// MARK: — Helpers + +/// Removes all `` spans from `text`, including the closing tag's +/// trailing newline when present. Used to sanitise assistant history messages +/// before they are re-submitted to the Jinja chat-template renderer on subsequent +/// turns — Qwen3 (and similar "thinking" models) raise TemplateException error 1 +/// when prior assistant turns contain raw thinking tags. 
+///
+/// Trimming is applied only when at least one tag span was actually removed so
+/// that assistant messages without thinking content are returned byte-for-byte
+/// (preserving leading spaces, code-block indentation, etc.).
+func stripThinkingTags(from text: String) -> String {
+    var result = text
+    var stripped = false
+    while let openRange = result.range(of: "<think>") {
+        stripped = true
+        if let closeRange = result.range(of: "</think>", range: openRange.lowerBound..<result.endIndex) {
+            // Consume the trailing newline after </think>, if present
+            var endIdx = closeRange.upperBound
+            if endIdx < result.endIndex && result[endIdx] == "\n" {
+                endIdx = result.index(after: endIdx)
+            }
+            result.removeSubrange(openRange.lowerBound..<endIdx)
+        } else {
+            // No closing </think> — strip from opening tag to end of string
+            result.removeSubrange(openRange.lowerBound...)
+            break
+        }
+    }
+    // Only trim surrounding whitespace that was introduced by stripping;
+    // leave untouched messages that contained no think tags.
+    return stripped ? result.trimmingCharacters(in: .whitespacesAndNewlines) : result
+}
+
+extension InferenceEngine {
+    // MARK: — Generation (continued)
 
     public nonisolated func generate(
         messages: [ChatMessage],
@@ -515,10 +546,21 @@
                 if msg.role == .system {
                     pendingSystemContext += msg.content + "\n\n"
                 } else {
-                    var roleRaw = msg.role.rawValue
-                    if roleRaw == "assistant" { roleRaw = "model" }
+                    // Use the canonical role name — Qwen3 (and most models) use
+                    // "assistant", not "model". The "model" alias is Gemma-specific
+                    // and breaks Qwen3's Jinja chat template on multi-turn history.
+                    let roleRaw = msg.role.rawValue  // "user" | "assistant" | "tool"
                     var content = msg.content
+                    // Strip <think>…</think> blocks from prior assistant turns.
+                    // If the model generated thinking content on a previous turn and
+                    // it was not already split into thinkingContent, the raw tags will
+                    // be present in `content`. Feeding them back into the Jinja template
+                    // on the next request causes TemplateException error 1 on Qwen3.
+                    if msg.role == .assistant {
+                        content = stripThinkingTags(from: content)
+                    }
+
                     if roleRaw == "user" && !pendingSystemContext.isEmpty {
                         content = "[SYSTEM CONTEXT / PERSONA DATA]\n" + pendingSystemContext + "\n[END CONTEXT]\n\n" + content
                         pendingSystemContext = ""  // Clear after injecting
@@ -545,17 +587,46 @@
         var outputText = ""
         var tokenCount = 0
 
-        let userInput = UserInput(messages: mlxMessages)
+        // Set RNG seed for reproducible output when requested.
+        if let seed = config.seed {
+            MLX.seed(seed)
+        }
+
+        // Pass enable_thinking to the Jinja chat template so the model
+        // actually generates <think> blocks when thinking mode is ON.
+        // Without this kwarg, Qwen3's template defaults to thinking=false
+        // regardless of what the UI toggle shows.
+        let additionalContext: [String: any Sendable]? = config.enableThinking
+            ? ["enable_thinking": true]
+            : ["enable_thinking": false]
+        let userInput = UserInput(
+            messages: mlxMessages,
+            additionalContext: additionalContext
+        )
         let lmInput = try await container.prepare(input: userInput)
 
-        // Approximate the input token size (as LMInput wrapper blocks direct inspection without private API)
-        // MLX often counts 1 word roughly as 1.3 tokens.
-        let stringLength = mlxMessages.map { ($0["content"] ?? "").count }.reduce(0, +)
-        let baseTokens = Int(Double(stringLength) / 3.5)
+        // Use the real token count from the prepared LMInput rather than
+        // a character-length heuristic (which was consistently off by 2–3×
+        // for CJK and code content).
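+        // (e.g. a four-character CJK string is often four or more tokens,
+        // where characters / 3.5 would have estimated just one)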
+        let baseTokens = lmInput.text.tokens.size
         self.activeContextTokens = baseTokens
         // maxContextWindow is already set during loadModel() from config.json
-
+
+        // TurboKV: enable 3-bit PolarQuant+QJL on every KVCacheSimple layer
+        // before generation. Must be set on the model (not the cache) so the
+        // cache inherits the flag when newCache() is called inside generate().
+        if config.turboKV {
+            await container.perform { ctx in
+                for module in ctx.model.modules() {
+                    if let simple = module as? KVCacheSimple {
+                        simple.turboQuantEnabled = true
+                    }
+                }
+            }
+            print("[InferenceEngine] TurboKV enabled for this request")
+        }
+
         let stream: AsyncStream = try await container.generate(
             input: lmInput,
             parameters: params
diff --git a/SwiftBuddy/SwiftBuddy/ViewModels/ChatViewModel.swift b/SwiftBuddy/SwiftBuddy/ViewModels/ChatViewModel.swift
index 5fcf1f6..60542c9 100644
--- a/SwiftBuddy/SwiftBuddy/ViewModels/ChatViewModel.swift
+++ b/SwiftBuddy/SwiftBuddy/ViewModels/ChatViewModel.swift
@@ -12,14 +12,43 @@ final class ChatViewModel: ObservableObject {
     @Published var streamingText: String = ""
     @Published var thinkingText: String? = nil
     @Published var isGenerating: Bool = false
-    @Published var config: GenerationConfig = .default
-    @Published var systemPrompt: String = ""
+    @Published var config: GenerationConfig = .load() {
+        didSet { scheduleConfigSave() }
+    }
+    @Published var systemPrompt: String = {
+        UserDefaults.standard.string(forKey: "swiftlm.systemPrompt") ?? ""
+    }() {
+        didSet { scheduleSystemPromptSave() }
+    }
     public var currentWing: String? = nil
     weak var engine: InferenceEngine?
     var modelContext: ModelContext?
     private var generationTask: Task<Void, Never>?
     private var activeSession: ChatSession?
+
+    // MARK: — Debounced persistence
+    // Saves are debounced at 0.5 s so rapid slider drags or keystrokes
+    // don't saturate UserDefaults with synchronous writes and cause UI jank.
+    private var configSaveWork: DispatchWorkItem?
+    private var systemPromptSaveWork: DispatchWorkItem?
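+    // NOTE: a DispatchWorkItem debounce means a save scheduled in the final
+    // 0.5 s before the app terminates can be lost — acceptable for these
+    // low-stakes preferences; call save() directly on any explicit quit path
+    // if that ever becomes an issue.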
+ + private func scheduleConfigSave() { + configSaveWork?.cancel() + let work = DispatchWorkItem { [weak self] in self?.config.save() } + configSaveWork = work + DispatchQueue.main.asyncAfter(deadline: .now() + 0.5, execute: work) + } + + private func scheduleSystemPromptSave() { + systemPromptSaveWork?.cancel() + let snapshot = systemPrompt + let work = DispatchWorkItem { + UserDefaults.standard.set(snapshot, forKey: "swiftlm.systemPrompt") + } + systemPromptSaveWork = work + DispatchQueue.main.asyncAfter(deadline: .now() + 0.5, execute: work) + } + // MARK: — Send func send(_ userText: String) async { diff --git a/SwiftBuddy/SwiftBuddy/ViewModels/ServerManager.swift b/SwiftBuddy/SwiftBuddy/ViewModels/ServerManager.swift index c76c917..3455304 100644 --- a/SwiftBuddy/SwiftBuddy/ViewModels/ServerManager.swift +++ b/SwiftBuddy/SwiftBuddy/ViewModels/ServerManager.swift @@ -150,10 +150,108 @@ final class ServerManager: ObservableObject { return Response(status: .ok, headers: swiftBuddyJSONHeaders, body: .init(byteBuffer: buffer)) } - // Simple V1 models mock + // ── /v1/models ───────────────────────────────────────── router.get("/v1/models") { _, _ -> Response in - let buffer = ByteBuffer(string: #"{"object": "list", "data": [{"id": "local", "object": "model"}]}"#) - return Response(status: .ok, headers: swiftBuddyJSONHeaders, body: .init(byteBuffer: buffer)) + let modelId: String + switch await engine.state { + case .ready(let id): modelId = id + default: modelId = "none" + } + // Use swiftBuddyJSONString to safely escape the model ID — + // model IDs with slashes (e.g. "mlx-community/Qwen3") are safe, + // but quotes or control chars would break the JSON structure. + let body = "{\"object\":\"list\",\"data\":[{\"id\":\(swiftBuddyJSONString(modelId)),\"object\":\"model\",\"owned_by\":\"swiftbuddy\"}]}" + return Response(status: .ok, headers: swiftBuddyJSONHeaders, + body: .init(byteBuffer: ByteBuffer(string: body))) + } + + // ── /v1/chat/completions (OpenAI-compatible, streaming + non-streaming) ── + router.post("/v1/chat/completions") { request, _ -> Response in + // 1. Parse body + guard let bodyData = try? await request.body.collect(upTo: 4 * 1024 * 1024), + let json = try? JSONSerialization.jsonObject(with: Data(buffer: bodyData)) as? [String: Any] + else { + let err = #"{"error":{"message":"Invalid JSON body","type":"invalid_request_error"}}"# + return Response(status: .badRequest, headers: swiftBuddyJSONHeaders, + body: .init(byteBuffer: ByteBuffer(string: err))) + } + + let streamRequested = json["stream"] as? Bool ?? false + + // 2. Map messages + var chatMessages: [ChatMessage] = [] + if let msgs = json["messages"] as? [[String: Any]] { + for m in msgs { + let role = m["role"] as? String ?? "user" + let content = m["content"] as? String ?? "" + switch role { + case "system", "developer": chatMessages.append(.system(content)) + case "assistant": chatMessages.append(.assistant(content)) + case "tool": chatMessages.append(.tool(content)) + case "user": chatMessages.append(.user(content)) + default: chatMessages.append(.user(content)) + } + } + } + + // 3. Build request config from persisted user defaults + per-request overrides + var reqConfig = GenerationConfig.load() + if let t = json["temperature"] as? Double { reqConfig.temperature = Float(t) } + if let p = json["top_p"] as? Double { reqConfig.topP = Float(p) } + if let mt = json["max_tokens"] as? Int { reqConfig.maxTokens = mt } + if let rp = json["frequency_penalty"] as? 
Double, rp > 0 { reqConfig.repetitionPenalty = Float(rp) }  // treated as a multiplicative repeat-penalty; 0 (the OpenAI default) is ignored
+
+            let modelId: String
+            switch await engine.state {
+            case .ready(let id): modelId = id
+            default: modelId = "local"
+            }
+            let reqId = "chatcmpl-\(UUID().uuidString.prefix(8))"
+            let created = Int(Date().timeIntervalSince1970)
+            // Escape model ID once — used in both streaming and non-streaming paths.
+            // Slashes in HF model IDs (e.g. "mlx-community/Qwen3") are safe inside
+            // JSON strings, but quotes/control chars in custom model names would break.
+            let escapedModelId = swiftBuddyJSONString(modelId)
+
+            // Helper: JSON-safe escape for token text using JSONEncoder so ALL
+            // control chars (U+0000–U+001F) are correctly escaped, not just \n/\r/\t.
+            func jsonEscape(_ s: String) -> String {
+                guard let data = try? JSONEncoder().encode(s),
+                      let raw = String(data: data, encoding: .utf8) else { return "\"\"" }
+                // JSONEncoder wraps in outer quotes — strip them for inline interpolation
+                return String(raw.dropFirst().dropLast())
+            }
+
+            if streamRequested {
+                // ── SSE streaming ───────────────────────────────────
+                var sseHeaders = HTTPFields()
+                sseHeaders.append(HTTPField(name: .contentType, value: "text/event-stream; charset=utf-8"))
+                sseHeaders.append(HTTPField(name: HTTPField.Name("Cache-Control")!, value: "no-cache"))
+                sseHeaders.append(HTTPField(name: HTTPField.Name("X-Accel-Buffering")!, value: "no"))
+
+                let sseStream = AsyncStream<ByteBuffer> { cont in
+                    Task {
+                        for await token in await engine.generate(messages: chatMessages, config: reqConfig) {
+                            let chunk = "{\"id\":\"\(reqId)\",\"object\":\"chat.completion.chunk\",\"created\":\(created),\"model\":\(escapedModelId),\"choices\":[{\"index\":0,\"delta\":{\"content\":\"\(jsonEscape(token.text))\"},\"finish_reason\":null}]}"
+                            cont.yield(ByteBuffer(string: "data: \(chunk)\n\n"))
+                        }
+                        cont.yield(ByteBuffer(string: "data: [DONE]\n\n"))
+                        cont.finish()
+                    }
+                }
+                return Response(status: .ok, headers: sseHeaders,
+                                body: .init(asyncSequence: sseStream))
+
+            } else {
+                // ── Non-streaming: collect full response ────────────
+                var fullText = ""
+                for await token in await engine.generate(messages: chatMessages, config: reqConfig) {
+                    fullText += token.text
+                }
+                let body = "{\"id\":\"\(reqId)\",\"object\":\"chat.completion\",\"created\":\(created),\"model\":\(escapedModelId),\"choices\":[{\"index\":0,\"message\":{\"role\":\"assistant\",\"content\":\"\(jsonEscape(fullText))\"},\"finish_reason\":\"stop\"}],\"usage\":{\"prompt_tokens\":0,\"completion_tokens\":0,\"total_tokens\":0}}"
+                return Response(status: .ok, headers: swiftBuddyJSONHeaders,
+                                body: .init(byteBuffer: ByteBuffer(string: body)))
+            }
+        }
 
         let app = Application(
diff --git a/SwiftBuddy/SwiftBuddy/Views/ChatView.swift b/SwiftBuddy/SwiftBuddy/Views/ChatView.swift
index 8bded67..831fe81 100644
--- a/SwiftBuddy/SwiftBuddy/Views/ChatView.swift
+++ b/SwiftBuddy/SwiftBuddy/Views/ChatView.swift
@@ -137,19 +137,28 @@
             case .downloading(let progress, let speed):
                 DownloadAnimationView(progress: progress, speed: speed)
 
-            case .loading:
+            case .loading(let progress, let stage):
                 VStack(spacing: 16) {
                     ZStack {
                         Circle()
                             .stroke(SwiftBuddyTheme.accent.opacity(0.15), lineWidth: 3)
                             .frame(width: 64, height: 64)
-                        ProgressView()
+                        ProgressView(value: progress)
                             .controlSize(.large)
                             .tint(SwiftBuddyTheme.accent)
+                            .frame(width: 64)
+                    }
+                    VStack(spacing: 4) {
+                        Text("Loading model into Metal GPU…")
+                            .font(.subheadline)
+                            .foregroundStyle(SwiftBuddyTheme.textSecondary)
+                        Text(stage)
+                            .font(.caption)
+
.foregroundStyle(SwiftBuddyTheme.textTertiary) + Text("\(Int(progress * 100))%") + .font(.caption.monospacedDigit()) + .foregroundStyle(SwiftBuddyTheme.textTertiary) } - Text("Loading model into Metal GPU…") - .font(.subheadline) - .foregroundStyle(SwiftBuddyTheme.textSecondary) } case .idle: @@ -252,13 +261,18 @@ struct ChatView: View { switch engine.state { case .idle: bannerRow(icon: "cpu", text: "No model loaded", color: SwiftBuddyTheme.textTertiary) - case .loading: - HStack(spacing: 8) { - ProgressView().controlSize(.mini).tint(SwiftBuddyTheme.accent) - Text("Loading model…") - .font(.caption) - .foregroundStyle(SwiftBuddyTheme.textSecondary) - Spacer() + case .loading(let progress, let stage): + VStack(alignment: .leading, spacing: 4) { + HStack { + Text(stage) + .font(.caption.weight(.medium)) + .foregroundStyle(SwiftBuddyTheme.textSecondary) + Spacer() + Text("\(Int(progress * 100))%") + .font(.caption2.monospacedDigit()) + .foregroundStyle(SwiftBuddyTheme.textTertiary) + } + ProgressView(value: progress).tint(SwiftBuddyTheme.accent) } .padding(.horizontal, 16) .padding(.vertical, 8) @@ -527,7 +541,7 @@ extension ModelState { var shortLabel: String { switch self { case .idle: return "No model" - case .loading: return "Loading…" + case .loading(let progress, _): return "\(Int(progress * 100))% loading" case .downloading(let p, _): return "\(Int(p * 100))% downloading" case .ready(let modelId): return modelId.components(separatedBy: "/").last ?? modelId case .generating: return "Generating" diff --git a/SwiftBuddy/SwiftBuddy/Views/ModelsView.swift b/SwiftBuddy/SwiftBuddy/Views/ModelsView.swift index ac0cfc1..ad96882 100644 --- a/SwiftBuddy/SwiftBuddy/Views/ModelsView.swift +++ b/SwiftBuddy/SwiftBuddy/Views/ModelsView.swift @@ -277,8 +277,8 @@ private struct ActiveModelCardView: View { entry: engine.loadedModelId.flatMap { id in ModelCatalog.all.first(where: { $0.id == id }) }, state: engine.state ) - case .loading: - loadingCard + case .loading(let progress, let stage): + loadingCard(progress: progress, stage: stage) case .downloading(let progress, let speed): downloadingCard(progress: progress, speed: speed) case .idle, .error: @@ -287,18 +287,24 @@ private struct ActiveModelCardView: View { } } - private var loadingCard: some View { - HStack(spacing: 12) { - ProgressView().controlSize(.regular).tint(SwiftBuddyTheme.accent) - VStack(alignment: .leading, spacing: 2) { - Text("Loading model…") - .font(.subheadline.weight(.semibold)) - .foregroundStyle(SwiftBuddyTheme.textPrimary) - Text("Initializing Metal GPU") - .font(.caption) + private func loadingCard(progress: Double, stage: String) -> some View { + VStack(alignment: .leading, spacing: 10) { + HStack { + ProgressView().controlSize(.regular).tint(SwiftBuddyTheme.accent) + VStack(alignment: .leading, spacing: 2) { + Text("Loading model…") + .font(.subheadline.weight(.semibold)) + .foregroundStyle(SwiftBuddyTheme.textPrimary) + Text(stage) + .font(.caption) + .foregroundStyle(SwiftBuddyTheme.textSecondary) + } + Spacer() + Text("\(Int(progress * 100))%") + .font(.caption.monospacedDigit()) .foregroundStyle(SwiftBuddyTheme.textSecondary) } - Spacer() + ProgressView(value: progress).tint(SwiftBuddyTheme.accent) } .padding() .glassCard(cornerRadius: SwiftBuddyTheme.radiusLarge) diff --git a/SwiftBuddy/SwiftBuddy/Views/RootView.swift b/SwiftBuddy/SwiftBuddy/Views/RootView.swift index 049e4b3..efa301a 100644 --- a/SwiftBuddy/SwiftBuddy/Views/RootView.swift +++ b/SwiftBuddy/SwiftBuddy/Views/RootView.swift @@ -26,6 +26,7 @@ struct 
RootView: View { @State private var showTextIngestion = false @State private var showModelManagement = false @State private var lastDownloadLogBucket: Int? + @State private var lastLoadingStage: String? enum Tab { case chat, models, palace, mindPalace, miner, settings } var body: some View { @@ -72,11 +73,16 @@ struct RootView: View { switch newState { case .idle: lastDownloadLogBucket = nil + lastLoadingStage = nil ConsoleLog.shared.info("Engine idle — no model loaded") - case .loading: + case .loading(_, let stage): lastDownloadLogBucket = nil - ConsoleLog.shared.info("Loading model…") + if lastLoadingStage != stage { + lastLoadingStage = stage + ConsoleLog.shared.info(stage) + } case .downloading(let p, let speed): + lastLoadingStage = nil let percent = Int(p * 100) let bucket = min((percent / 25) * 25, 100) if bucket != lastDownloadLogBucket, [0, 25, 50, 75, 100].contains(bucket) { @@ -85,12 +91,15 @@ struct RootView: View { } case .ready(let modelId): lastDownloadLogBucket = nil + lastLoadingStage = nil ConsoleLog.shared.info("✓ Model ready: \(modelId)") case .generating: lastDownloadLogBucket = nil + lastLoadingStage = nil ConsoleLog.shared.debug("Generating…") case .error(let msg): lastDownloadLogBucket = nil + lastLoadingStage = nil ConsoleLog.shared.error("Engine error: \(msg)") } } @@ -430,12 +439,12 @@ struct RootView: View { .tint(SwiftBuddyTheme.accent) .controlSize(.small) - case .loading: - HStack(spacing: 6) { - ProgressView().controlSize(.mini).tint(SwiftBuddyTheme.accent) - Text("Loading…") - .font(.caption) - .foregroundStyle(SwiftBuddyTheme.textSecondary) + case .loading(let progress, let stage): + VStack(alignment: .leading, spacing: 4) { + ProgressView(value: progress).tint(SwiftBuddyTheme.accent) + Text("\(Int(progress * 100))% · \(stage)") + .font(.caption2.monospacedDigit()) + .foregroundStyle(SwiftBuddyTheme.textTertiary) } case .downloading(let progress, let speed): diff --git a/SwiftBuddy/SwiftBuddy/Views/SettingsView.swift b/SwiftBuddy/SwiftBuddy/Views/SettingsView.swift index 9f6f5e7..e54bad2 100644 --- a/SwiftBuddy/SwiftBuddy/Views/SettingsView.swift +++ b/SwiftBuddy/SwiftBuddy/Views/SettingsView.swift @@ -18,6 +18,10 @@ struct SettingsView: View { @State private var selectedTab: SettingsTab = .generation @State private var draftServerConfiguration = ServerStartupConfiguration.load() @State private var showRestartNotification = false + @State private var endpointCopied = false + @State private var showAppliedBadge = false + @State private var toastHideWork: DispatchWorkItem? = nil + @State private var cliCopied = false @State private var serverSaveMessage = "Server settings saved" @State private var restartNotificationRequiresAction = false @@ -28,18 +32,36 @@ struct SettingsView: View { Double(ProcessInfo.processInfo.physicalMemory) / (1024 * 1024 * 1024) } + private var currentModelIsMoE: Bool { + guard case .ready(let modelId) = engine.state else { return false } + return ModelCatalog.all.first(where: { $0.id == modelId })?.isMoE ?? 
false + } + + private var effectiveStreamExpertsSetting: Bool { + viewModel.config.effectiveStreamExperts(defaultingTo: currentModelIsMoE) + } + + private var ssdStreamingBinding: Binding { + Binding( + get: { effectiveStreamExpertsSetting }, + set: { viewModel.config.streamExperts = $0 } + ) + } + enum SettingsTab: String, CaseIterable { case generation = "Generation" case engine = "Engine" + case appearance = "Appearance" case console = "Console" case about = "About" var icon: String { switch self { - case .generation: return "slider.horizontal.3" - case .engine: return "cpu" - case .console: return "terminal" - case .about: return "info.circle" + case .generation: return "slider.horizontal.3" + case .engine: return "cpu" + case .appearance: return "paintpalette" + case .console: return "terminal" + case .about: return "info.circle" } } } @@ -62,6 +84,8 @@ struct SettingsView: View { generationTab case .engine: engineTab + case .appearance: + appearanceTab case .console: consoleTab case .about: @@ -190,13 +214,16 @@ struct SettingsView: View { parameterCard("Output") { sliderRow( - label: "Max Tokens", icon: "text.word.spacing", + label: "Max Response Tokens", icon: "text.word.spacing", value: Binding( get: { Double(viewModel.config.maxTokens) }, set: { viewModel.config.maxTokens = Int($0) } ), - range: 128...max(16384.0, Double(engine.maxContextWindow)), step: 128, format: "%.0f", - tint: SwiftBuddyTheme.accent + range: 128...16384.0, step: 128, format: "%.0f", + tint: SwiftBuddyTheme.accent, + hint: engine.maxContextWindow > 0 + ? "Max output per reply. Model context window: \(engine.maxContextWindow / 1000)K tokens" + : "Max tokens generated per response (context window shown once model loads)" ) sliderRow( label: "Repetition Penalty", icon: "repeat.circle", @@ -208,6 +235,48 @@ struct SettingsView: View { tint: SwiftBuddyTheme.success, hint: "Higher = less repeating, 1.0 = disabled" ) + + // Seed — optional reproducibility + HStack { + Label("Seed", systemImage: "number") + .foregroundStyle(SwiftBuddyTheme.textPrimary) + .font(.callout) + Spacer() + if let seed = viewModel.config.seed { + Text("\(seed)") + .foregroundStyle(SwiftBuddyTheme.textSecondary) + .font(.callout.monospacedDigit()) + Stepper("", value: Binding( + get: { Int(min(seed, UInt64(Int.max))) }, + set: { viewModel.config.seed = UInt64($0) } + ), in: 0...Int.max) + .labelsHidden() + Button { + viewModel.config.seed = nil + } label: { + Image(systemName: "xmark.circle.fill") + .foregroundStyle(SwiftBuddyTheme.textTertiary) + } + .buttonStyle(.plain) + } else { + Text("Random") + .foregroundStyle(SwiftBuddyTheme.textTertiary) + .font(.callout) + Button { + viewModel.config.seed = UInt64.random(in: 0...UInt64(Int.max)) + } label: { + Image(systemName: "lock.fill") + .foregroundStyle(SwiftBuddyTheme.accent) + } + .buttonStyle(.plain) + } + } + .padding(.vertical, 2) + if viewModel.config.seed != nil { + Text("Fixed seed — same input will produce identical output") + .font(.caption2) + .foregroundStyle(SwiftBuddyTheme.textTertiary) + } } parameterCard("Reasoning") { @@ -219,6 +288,21 @@ struct SettingsView: View { ) } + parameterCard("Performance") { + toggleRow( + label: "SSD Streaming", icon: "internaldrive", + isOn: ssdStreamingBinding, + tint: SwiftBuddyTheme.warning, + hint: "Stream MoE expert weights from NVMe (requires model reload)" + ) + toggleRow( + label: "TurboQuant KV", icon: "bolt.badge.clock", + isOn: $viewModel.config.turboKV, + tint: SwiftBuddyTheme.success, + hint: "3-bit KV compression for massive context 
windows" + ) + } + parameterCard("System Prompt") { TextEditor(text: $viewModel.systemPrompt) .frame(minHeight: 80) @@ -235,8 +319,8 @@ struct SettingsView: View { // Reset button Button(role: .destructive) { - viewModel.config = .default - viewModel.systemPrompt = "" + viewModel.config = .default // didSet triggers config.save() + viewModel.systemPrompt = "" // didSet clears UserDefaults key } label: { HStack { Image(systemName: "arrow.counterclockwise") @@ -256,6 +340,39 @@ struct SettingsView: View { } .padding(.top, 8) } + // Generation params are hot-applied per request — no restart needed. + // Flash a brief badge so the user knows the change was captured. + .onChange(of: viewModel.config.temperature) { flashApplied() } + .onChange(of: viewModel.config.topP) { flashApplied() } + .onChange(of: viewModel.config.topK) { flashApplied() } + .onChange(of: viewModel.config.minP) { flashApplied() } + .onChange(of: viewModel.config.maxTokens) { flashApplied() } + .onChange(of: viewModel.config.repetitionPenalty) { flashApplied() } + .onChange(of: viewModel.config.enableThinking) { flashApplied() } + .onChange(of: viewModel.config.kvBits) { flashApplied() } + .onChange(of: viewModel.config.prefillSize) { flashApplied() } + .onChange(of: viewModel.config.seed) { flashApplied() } + .overlay(alignment: .top) { + if showAppliedBadge { + HStack(spacing: 6) { + Image(systemName: "checkmark.circle.fill") + .foregroundStyle(SwiftBuddyTheme.success) + .font(.caption) + Text("Applied — takes effect on next message") + .font(.caption.weight(.medium)) + .foregroundStyle(SwiftBuddyTheme.textPrimary) + } + .padding(.horizontal, 14) + .padding(.vertical, 8) + .background(.ultraThinMaterial) + .background(SwiftBuddyTheme.success.opacity(0.12)) + .clipShape(Capsule()) + .overlay(Capsule().strokeBorder(SwiftBuddyTheme.success.opacity(0.3), lineWidth: 1)) + .padding(.top, 8) + .transition(.move(edge: .top).combined(with: .opacity)) + .animation(.easeInOut(duration: 0.2), value: showAppliedBadge) + } + } } // MARK: — Engine Tab @@ -264,14 +381,57 @@ struct SettingsView: View { ScrollView { VStack(spacing: 16) { parameterCard("Local API Server") { - HStack { - Label(server.isOnline ? "Online" : "Offline", systemImage: "network") - .foregroundStyle(server.isOnline ? SwiftBuddyTheme.success : SwiftBuddyTheme.textSecondary) - .font(.callout.weight(.medium)) - Spacer() - Text("\(server.host):\(server.port)") - .foregroundStyle(SwiftBuddyTheme.textSecondary) - .font(.callout.monospacedDigit()) + // ── Endpoint URL card (tap to copy) ───────────────────── + let endpointURL = "http://\(server.host):\(server.port)" + Button { + copyEndpoint(endpointURL) + } label: { + HStack(spacing: 12) { + // Status dot + Circle() + .fill(server.isOnline ? SwiftBuddyTheme.success : SwiftBuddyTheme.textTertiary) + .frame(width: 8, height: 8) + .shadow(color: server.isOnline ? SwiftBuddyTheme.success.opacity(0.6) : .clear, + radius: 4) + + VStack(alignment: .leading, spacing: 2) { + Text(server.isOnline ? "Online" : "Offline") + .font(.caption2.weight(.semibold)) + .foregroundStyle(server.isOnline ? SwiftBuddyTheme.success : SwiftBuddyTheme.textTertiary) + Text(endpointURL) + .font(.system(.callout, design: .monospaced)) + .foregroundStyle(SwiftBuddyTheme.textPrimary) + } + + Spacer() + + // Copy / confirm icon + Image(systemName: endpointCopied ? "checkmark" : "doc.on.doc") + .font(.caption) + .foregroundStyle(endpointCopied ? 
SwiftBuddyTheme.success : SwiftBuddyTheme.textTertiary) + .animation(.easeInOut(duration: 0.2), value: endpointCopied) + } + .padding(12) + .frame(maxWidth: .infinity) + .background(SwiftBuddyTheme.background.opacity(0.6)) + .clipShape(RoundedRectangle(cornerRadius: 10)) + .overlay( + RoundedRectangle(cornerRadius: 10) + .strokeBorder( + server.isOnline + ? SwiftBuddyTheme.success.opacity(0.3) + : Color.white.opacity(0.07), + lineWidth: 1 + ) + ) + } + .buttonStyle(.plain) + + // Quick-use hint for external tools + if server.isOnline { + Text("Compatible with OpenAI SDK, LM Studio, Continue, Cursor") + .font(.caption2) + .foregroundStyle(SwiftBuddyTheme.textTertiary) } toggleRow( @@ -376,22 +536,92 @@ struct SettingsView: View { ) } - parameterCard("Appearance") { - HStack { - Label("Color Scheme", systemImage: "paintpalette") - .foregroundStyle(SwiftBuddyTheme.textPrimary) - .font(.callout) - Spacer() - } - Picker("", selection: $appearance.preference) { - HStack { Image(systemName: "moon.fill"); Text("Dark") }.tag("dark") - HStack { Image(systemName: "sun.max.fill"); Text("Light") }.tag("light") - HStack { Image(systemName: "circle.lefthalf.filled"); Text("System") }.tag("system") + parameterCard("Advanced Engine") { + // ── TurboKV (per-request, no reload needed) ────────────────────────── + toggleRow( + label: "TurboKV Compression", icon: "memorychip", + isOn: $viewModel.config.turboKV, + tint: SwiftBuddyTheme.warning, + hint: "3-bit PolarQuant+QJL compression for KV history >8K tokens. Halves long-context RAM — applied per request" + ) + + Divider().background(SwiftBuddyTheme.divider) + + // ── SSD Expert Streaming (load-time — shows reload prompt) ──── + VStack(alignment: .leading, spacing: 6) { + toggleRow( + label: "SSD Expert Streaming", icon: "externaldrive.fill", + isOn: ssdStreamingBinding, + tint: SwiftBuddyTheme.accentSecondary, + hint: "mmap expert weights from NVMe — only active expert pages stay in RAM. Auto-enabled for MoE catalog models." + ) + if effectiveStreamExpertsSetting != currentModelIsMoE { + VStack(alignment: .leading, spacing: 8) { + HStack(spacing: 6) { + Image(systemName: "arrow.clockwise.circle.fill") + .foregroundStyle(SwiftBuddyTheme.warning) + .font(.caption) + Text("Reload model to apply this change") + .font(.caption2.weight(.medium)) + .foregroundStyle(SwiftBuddyTheme.warning) + Spacer() + Button("Reload") { + let currentId: String? 
= { + if case .ready(let id) = engine.state { return id } + return nil + }() + if let id = currentId { + Task { + engine.unload() + await engine.load(modelId: id) + } + } + } + .font(.caption2.weight(.semibold)) + .foregroundStyle(SwiftBuddyTheme.accent) + .buttonStyle(.plain) + } + + switch engine.state { + case .loading(let progress, let stage): + VStack(alignment: .leading, spacing: 4) { + HStack { + Text(stage) + .font(.caption2.weight(.medium)) + .foregroundStyle(SwiftBuddyTheme.textSecondary) + Spacer() + Text("\(Int(progress * 100))%") + .font(.caption2.monospacedDigit()) + .foregroundStyle(SwiftBuddyTheme.textTertiary) + } + ProgressView(value: progress) + .tint(SwiftBuddyTheme.accent) + } + case .downloading(let progress, let speed): + VStack(alignment: .leading, spacing: 4) { + HStack { + Text("Downloading model files") + .font(.caption2.weight(.medium)) + .foregroundStyle(SwiftBuddyTheme.textSecondary) + Spacer() + Text("\(Int(progress * 100))% · \(speed)") + .font(.caption2.monospacedDigit()) + .foregroundStyle(SwiftBuddyTheme.textTertiary) + } + ProgressView(value: progress) + .tint(SwiftBuddyTheme.accent) + } + default: + EmptyView() + } + } + .padding(.horizontal, 4) + .padding(.vertical, 6) + .background(SwiftBuddyTheme.warning.opacity(0.08)) + .clipShape(RoundedRectangle(cornerRadius: 8)) + } } - .pickerStyle(.segmented) - .tint(SwiftBuddyTheme.accent) } - #if os(iOS) parameterCard("iOS Performance") { toggleRow( @@ -414,12 +644,80 @@ struct SettingsView: View { } #endif + // ── CLI Equivalent ────────────────────────────────────────── + parameterCard("CLI Equivalent") { + Text("Run standalone server with these settings:") + .font(.caption2) + .foregroundStyle(SwiftBuddyTheme.textTertiary) + + ScrollView(.horizontal, showsIndicators: false) { + Text(cliCommand) + .font(.system(size: 11, design: .monospaced)) + .foregroundStyle(SwiftBuddyTheme.textSecondary) + .textSelection(.enabled) + .padding(.vertical, 6) + } + + Button { + copyCLI() + } label: { + Label( + cliCopied ? "Copied!" : "Copy Command", + systemImage: cliCopied ? "checkmark" : "doc.on.doc" + ) + .font(.caption.weight(.medium)) + .frame(maxWidth: .infinity) + } + .buttonStyle(.bordered) + .tint(cliCopied ? 
SwiftBuddyTheme.success : SwiftBuddyTheme.accent)
+            .animation(.easeInOut(duration: 0.2), value: cliCopied)
+            }
 
                 Spacer(minLength: 20)
             }
             .padding(.top, 8)
         }
     }
 
+    // MARK: - Appearance Tab
+
+    // Use local state for the picker to avoid triggering a @Published write
+    // directly from within a view update cycle, which causes the crash:
+    // "Publishing changes from within view updates is not allowed"
+    @State private var localColorScheme: String = "dark"
+
+    private var appearanceTab: some View {
+        ScrollView {
+            VStack(spacing: 16) {
+                parameterCard("Theme") {
+                    HStack {
+                        Label("Color Scheme", systemImage: "paintpalette")
+                            .foregroundStyle(SwiftBuddyTheme.textPrimary)
+                            .font(.callout)
+                        Spacer()
+                    }
+                    Picker("", selection: $localColorScheme) {
+                        Text("Dark").tag("dark")
+                        Text("Light").tag("light")
+                        Text("System").tag("system")
+                    }
+                    .pickerStyle(.segmented)
+                    .tint(SwiftBuddyTheme.accent)
+                    .onChange(of: localColorScheme) { newValue in
+                        // Defer the @Published write to avoid the view update crash
+                        Task { @MainActor in
+                            appearance.preference = newValue
+                        }
+                    }
+                }
+            }
+            .padding(.horizontal, 16)
+            .padding(.bottom, 24)
+        }
+        .onAppear {
+            localColorScheme = appearance.preference
+        }
+    }
 
     // MARK: — Console Tab
 
     private var consoleTab: some View {
@@ -599,6 +897,59 @@
         .shadow(color: .black.opacity(0.18), radius: 14, y: 6)
     }
 
+    private func flashApplied() {
+        withAnimation { showAppliedBadge = true }
+        // Cancel any pending hide before scheduling a new one to prevent
+        // stacked closures from causing flicker when sliders are dragged rapidly.
+        toastHideWork?.cancel()
+        let work = DispatchWorkItem {
+            withAnimation { showAppliedBadge = false }
+        }
+        toastHideWork = work
+        DispatchQueue.main.asyncAfter(deadline: .now() + 2, execute: work)
+    }
+
+    /// Build the equivalent `swift run SwiftLM` command from current settings.
+    private var cliCommand: String {
+        buildCLICommand(
+            config: viewModel.config,
+            host: server.host,
+            port: server.port,
+            parallel: server.startupConfiguration.parallelSlots,
+            apiKeySet: !server.startupConfiguration.apiKey.isEmpty,
+            modelId: {
+                if case .ready(let id) = engine.state { return id }
+                return nil
+            }()
+        )
+    }
+
+    private func copyCLI() {
+        #if os(macOS)
+        NSPasteboard.general.clearContents()
+        NSPasteboard.general.setString(cliCommand, forType: .string)
+        #else
+        UIPasteboard.general.string = cliCommand
+        #endif
+        withAnimation { cliCopied = true }
+        DispatchQueue.main.asyncAfter(deadline: .now() + 2) {
+            withAnimation { cliCopied = false }
+        }
+    }
+
+    private func copyEndpoint(_ url: String) {
+        #if os(macOS)
+        NSPasteboard.general.clearContents()
+        NSPasteboard.general.setString(url, forType: .string)
+        #else
+        UIPasteboard.general.string = url
+        #endif
+        withAnimation { endpointCopied = true }
+        DispatchQueue.main.asyncAfter(deadline: .now() + 2) {
+            withAnimation { endpointCopied = false }
+        }
+    }
+
     private func saveServerConfiguration() {
         let changed = server.saveStartupConfiguration(draftServerConfiguration)
         draftServerConfiguration = server.startupConfiguration
diff --git a/tests/SwiftLMTests/ThinkingTagStripTests.swift b/tests/SwiftLMTests/ThinkingTagStripTests.swift
new file mode 100644
index 0000000..b258f44
--- /dev/null
+++ b/tests/SwiftLMTests/ThinkingTagStripTests.swift
@@ -0,0 +1,157 @@
+// ThinkingTagStripTests.swift — Regression tests for Issue #97
+//
+// Verifies two fixes:
+//   1. stripThinkingTags() correctly removes <think>…</think> blocks from
+//      assistant history messages so they never re-enter the Jinja template.
+//   2. The role mapping for "assistant" is NOT changed to "model" (Qwen3 fix).
+//
+// stripThinkingTags is private at file scope in InferenceEngine.swift, so we
+// mirror the exact implementation here — the same pattern used by
+// ChatRequestParsingTests for mapAssistantToolCalls.
+
+import XCTest
+import Foundation
+@testable import SwiftLM
+import MLXInferenceCore
+
+final class ThinkingTagStripTests: XCTestCase {
+
+    // ── Mirror of the production helper (InferenceEngine.swift) ──────────
+    // Keep in sync if the production implementation changes.
+
+    private func stripThinkingTags(from text: String) -> String {
+        var result = text
+        var stripped = false
+        while let openRange = result.range(of: "<think>") {
+            stripped = true
+            if let closeRange = result.range(of: "</think>", range: openRange.lowerBound..<result.endIndex) {
+                // Consume the trailing newline after </think>, if present
+                var endIdx = closeRange.upperBound
+                if endIdx < result.endIndex && result[endIdx] == "\n" {
+                    endIdx = result.index(after: endIdx)
+                }
+                result.removeSubrange(openRange.lowerBound..<endIdx)
+            } else {
+                // No closing </think> — strip from opening tag to end of string
+                result.removeSubrange(openRange.lowerBound...)
+                break
+            }
+        }
+        return stripped ? result.trimmingCharacters(in: .whitespacesAndNewlines) : result
+    }
+
+    // ═══════════════════════════════════════════════════════════════════
+    // MARK: - 1. Basic stripping
+    // ═══════════════════════════════════════════════════════════════════
+
+    func testStrip_MultipleThinkBlocks() {
+        // Handles multiple <think> sections in one reply
+        let input = "<think>first</think>\nVisible A\n<think>second</think>\nVisible B"
+        XCTAssertEqual(stripThinkingTags(from: input), "Visible A\nVisible B")
+    }
+
+    // ═══════════════════════════════════════════════════════════════════
+    // MARK: - 2. Edge cases
+    // ═══════════════════════════════════════════════════════════════════
+
+    func testStrip_UnclosedThinkTag_StripsToEndOfString() {
+        // If generation was interrupted mid-think, the closing tag may be absent.
+        let input = "Visible prefix\n<think>reasoning that never closed"
+        XCTAssertEqual(stripThinkingTags(from: input), "Visible prefix")
+    }
+
+    func testStrip_EmptyThinkBlock_RemovesTagsOnly() {
+        let input = "<think></think>The actual answer."
+        XCTAssertEqual(stripThinkingTags(from: input), "The actual answer.")
+    }
+
+    func testStrip_MultilineThinkBlock() {
+        let input = """
+        <think>
+        Line one of reasoning.
+        Line two of reasoning.
+        </think>
+        The final answer.
+        """
+        XCTAssertEqual(stripThinkingTags(from: input), "The final answer.")
+    }
+
+    func testStrip_ThinkBlockWithTrailingNewline_ConsumesNewline() {
+        // The production helper eats the single newline after </think>
+        // so the visible content doesn't start with a blank line.
+        let input = "<think>thought</think>\nAnswer starts here"
+        let result = stripThinkingTags(from: input)
+        XCTAssertFalse(result.hasPrefix("\n"), "Result must not start with a stray newline")
+        XCTAssertEqual(result, "Answer starts here")
+    }
+
+    func testStrip_ContentBeforeAndAfterThink() {
+        // Reproduces the exact shape of Qwen3 output with thinking ON:
+        // the UI shows the <think> block inline and the answer follows.
+        let input = "<think>\nThe user is asking me to continue a Russian tongue-twister.\nNo tool calls needed.\n</think>\nЕхал грека через реку,\nВидит грека — в реке рак."
+        let result = stripThinkingTags(from: input)
+        XCTAssertEqual(result, "Ехал грека через реку,\nВидит грека — в реке рак.")
+    }
+
+    // ═══════════════════════════════════════════════════════════════════
+    // MARK: - 3. Issue #97 crash reproducer
+    // ═══════════════════════════════════════════════════════════════════
+
+    func testStrip_Issue97_SecondTurnMessageShape() {
+        // This is the exact assistant content that caused TemplateException error 1
+        // when fed back unmodified into the Jinja template on turn 2.
+        let turn1AssistantOutput = """
+        <think>
+        The user said "Hi!" as a greeting. Let me check my available tools and context. \
+        No tool calls needed here — just a simple greeting.
+        </think>
+        Hello! 👋 It's great to meet you. How can I assist you today?
+        """
+        let stripped = stripThinkingTags(from: turn1AssistantOutput)
+
+        // After stripping, no tag should remain
+        XCTAssertFalse(stripped.contains("<think>"), "Stripped content must not contain <think>")
+        XCTAssertFalse(stripped.contains("</think>"), "Stripped content must not contain </think>")
+
+        // The visible reply must be preserved
+        XCTAssertTrue(stripped.contains("Hello!"), "Visible reply must survive stripping")
+    }
+
+    // ═══════════════════════════════════════════════════════════════════
+    // MARK: - 4. Role mapping regression guard (Issue #97)
+    // ═══════════════════════════════════════════════════════════════════
+    // The ChatCompletionRequest pipeline in Server.swift passes roles through
+    // as-is. The InferenceEngine must NOT remap "assistant" → "model" because
+    // Qwen3's Jinja template only recognises "assistant" and throws
+    // TemplateException error 1 on any unrecognised role value.
+
+    func testRoleMapping_AssistantRawValue_IsAssistant() {
+        // ChatMessage.Role.assistant.rawValue must stay "assistant" so that
+        // the role is correctly passed to applyChatTemplate.
+        // If someone changes the enum rawValue, this test fails loudly.
+        XCTAssertEqual(
+            ChatMessage.Role.assistant.rawValue,
+            "assistant",
+            "Role.assistant rawValue must be 'assistant', not 'model' — Qwen3 Jinja template fix (Issue #97)"
+        )
+    }
+
+    func testRoleMapping_AllRolesHaveExpectedRawValues() {
+        // Canonical role strings for the OpenAI-compatible message protocol.
+        XCTAssertEqual(ChatMessage.Role.system.rawValue, "system")
+        XCTAssertEqual(ChatMessage.Role.user.rawValue, "user")
+        XCTAssertEqual(ChatMessage.Role.assistant.rawValue, "assistant")
+        XCTAssertEqual(ChatMessage.Role.tool.rawValue, "tool")
+    }
+}