Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion Package.swift
Original file line number Diff line number Diff line change
Expand Up @@ -117,7 +117,7 @@ let package = Package(
),
.testTarget(
name: "SwiftLMTests",
dependencies: ["SwiftLM"]
dependencies: ["SwiftLM", "MLXInferenceCore"]
)
]
)
68 changes: 68 additions & 0 deletions Sources/MLXInferenceCore/CLICommandBuilder.swift
Original file line number Diff line number Diff line change
@@ -0,0 +1,68 @@
// CLICommandBuilder.swift — Pure function for building the equivalent CLI command
// Lives in MLXInferenceCore so it can be unit-tested by SwiftLMTests without
// requiring the SwiftBuddy app target.
import Foundation

/// Builds the equivalent `swift run SwiftLM` command string from persisted settings.
/// Only emits flags that differ from the CLI defaults, keeping the command readable.
///
/// - Parameters:
///   - config: The current `GenerationConfig`.
///   - host: The server host string (e.g. "127.0.0.1").
///   - port: The server port (e.g. 5413).
///   - parallel: Number of parallel request slots (default 1).
///   - apiKeySet: `true` if an API key is configured (key itself is redacted).
///   - modelId: The currently loaded model ID, or `nil` when no model is loaded.
/// - Returns: A multi-line shell command string suitable for display and copy.
public func buildCLICommand(
    config: GenerationConfig,
    host: String,
    port: Int,
    parallel: Int,
    apiKeySet: Bool,
    modelId: String?
) -> String {
    // Compare against the canonical defaults instead of duplicating magic
    // numbers (50, 1.05, 512, 64) here — this keeps the builder from
    // silently drifting if GenerationConfig's defaults ever change.
    let defaults = GenerationConfig.default

    var parts: [String] = []

    // Always-emitted flags: these identify the invocation even at defaults.
    parts.append("--model \(modelId ?? "<model-id>")")
    parts.append("--host \(host)")
    parts.append("--port \(port)")
    parts.append("--max-tokens \(config.maxTokens)")
    parts.append("--temp \(String(format: "%.2f", config.temperature))")

    // Conditionally-emitted flags: only shown when they differ from defaults.
    if config.topP < 1.0 {
        parts.append("--top-p \(String(format: "%.2f", config.topP))")
    }
    if config.topK != defaults.topK {
        parts.append("--top-k \(config.topK)")
    }
    if config.minP > 0 {
        parts.append("--min-p \(String(format: "%.2f", config.minP))")
    }
    if config.repetitionPenalty != defaults.repetitionPenalty {
        parts.append("--repeat-penalty \(String(format: "%.2f", config.repetitionPenalty))")
    }
    if config.prefillSize != defaults.prefillSize {
        parts.append("--prefill-size \(config.prefillSize)")
    }
    if let kvBits = config.kvBits {
        parts.append("--kv-bits \(kvBits)")
        // Group size is only meaningful when KV quantization is enabled.
        if config.kvGroupSize != defaults.kvGroupSize {
            parts.append("--kv-group-size \(config.kvGroupSize)")
        }
    }
    if config.enableThinking {
        parts.append("--thinking")
    }
    if let seed = config.seed {
        parts.append("--seed \(seed)")
    }
    if parallel > 1 {
        parts.append("--parallel \(parallel)")
    }
    if apiKeySet {
        // Never interpolate the real key into a copyable string.
        parts.append("--api-key <redacted>")
    }

    // NOTE(review): config.turboKV and config.streamExperts are never emitted
    // here — confirm whether the CLI exposes matching flags and, if so, add
    // them so the displayed command stays faithful to the app's settings.

    return "swift run SwiftLM " + parts.joined(separator: " \\\n ")
}
79 changes: 64 additions & 15 deletions Sources/MLXInferenceCore/GenerationConfig.swift
Original file line number Diff line number Diff line change
@@ -1,24 +1,29 @@
// GenerationConfig.swift — SwiftLM inference parameters
import Foundation

/// Configuration for a single generation request.
public struct GenerationConfig: Sendable {
/// Per-request generation parameters, persisted across app launches via UserDefaults.
///
/// ### Field classification
/// **Per-request** (applied on every `generate()` call — no reload needed):
/// temperature, topP, topK, minP, repetitionPenalty, seed, enableThinking,
/// prefillSize, kvBits, kvGroupSize, turboKV
///
/// **Load-time** (requires model reload to take effect):
/// streamExperts — controls SSD expert streaming for MoE and large models.
/// Stored here for persistence but applied by InferenceEngine at load time.
public struct GenerationConfig: Sendable, Codable {
public var maxTokens: Int
public var temperature: Float
public var topP: Float
public var topK: Int
public var minP: Float
public var repetitionPenalty: Float
public var seed: UInt64?
public var enableThinking: Bool

// ── SwiftLM Engine Parameters ──────────────────────────────────────
/// Enable TurboQuant KV-cache compression (3-bit PolarQuant+QJL).
/// Compresses KV history > 8192 tokens to ~3.5 bits/token.
public var turboKV: Bool
/// Optional RNG seed for reproducible outputs.
/// When non-nil, `MLX.seed(seed)` is called before each generation using this `UInt64` value.
public var seed: UInt64?

/// Enable SSD expert streaming for MoE models.
public var streamExperts: Bool
public var enableThinking: Bool

/// Chunk size for prefill evaluation.
/// Lower values prevent GPU timeout on large models.
Expand All @@ -30,6 +35,21 @@ public struct GenerationConfig: Sendable {
/// KV-cache quantization group size (default 64).
public var kvGroupSize: Int

/// Enable 3-bit TurboQuant KV-cache compression (PolarQuant+QJL).
/// Compresses KV history older than 8192 tokens to ~3.5 bits/token.
/// Recommended for 100k+ context to halve KV RAM usage.
/// Applied per-request — no model reload needed.
public var turboKV: Bool

/// Enable SSD expert streaming for MoE (and any large) models.
/// When true, expert weights are mmap'd from NVMe and only active
/// expert pages reside in RAM during inference (Flash-MoE style).
/// ⚠️ LOAD-TIME flag: changes take effect on the next model load.
/// MoE models (isMoE == true) default to true automatically;
/// this flag lets users override that for non-catalog models or
/// force-disable streaming even on MoE models.
public var streamExperts: Bool

public init(
maxTokens: Int = 2048,
temperature: Float = 0.6,
Expand All @@ -39,11 +59,11 @@ public struct GenerationConfig: Sendable {
repetitionPenalty: Float = 1.05,
seed: UInt64? = nil,
enableThinking: Bool = false,
turboKV: Bool = false,
streamExperts: Bool = false,
prefillSize: Int = 512,
kvBits: Int? = nil,
kvGroupSize: Int = 64
kvGroupSize: Int = 64,
turboKV: Bool = false,
streamExperts: Bool = false
) {
self.maxTokens = maxTokens
self.temperature = temperature
Expand All @@ -53,12 +73,41 @@ public struct GenerationConfig: Sendable {
self.repetitionPenalty = repetitionPenalty
self.seed = seed
self.enableThinking = enableThinking
self.turboKV = turboKV
self.streamExperts = streamExperts
self.prefillSize = prefillSize
self.kvBits = kvBits
self.kvGroupSize = kvGroupSize
self.turboKV = turboKV
self.streamExperts = streamExperts
}

public static let `default` = GenerationConfig()

// MARK: — Persistence

private static let storageKey = "swiftlm.generationConfig"

/// True when the user has previously saved a GenerationConfig.
/// Used to distinguish the first-run/default state from an explicit choice.
public static var hasPersistedConfig: Bool {
UserDefaults.standard.object(forKey: storageKey) != nil
}

/// Computes the effective SSD streaming setting.
/// Before the user has saved settings, MoE models default to streaming on.
/// After settings are persisted, the saved toggle becomes authoritative.
public func effectiveStreamExperts(defaultingTo defaultValue: Bool) -> Bool {
Self.hasPersistedConfig ? streamExperts : defaultValue
}

public func save() {
guard let data = try? JSONEncoder().encode(self) else { return }
UserDefaults.standard.set(data, forKey: Self.storageKey)
}

public static func load() -> GenerationConfig {
guard let data = UserDefaults.standard.data(forKey: storageKey),
let decoded = try? JSONDecoder().decode(GenerationConfig.self, from: data)
else { return .default }
return decoded
}
}
Loading
Loading