Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion Package.swift
Original file line number Diff line number Diff line change
Expand Up @@ -117,7 +117,7 @@ let package = Package(
),
.testTarget(
name: "SwiftLMTests",
dependencies: ["SwiftLM"]
dependencies: ["SwiftLM", "MLXInferenceCore"]
)
]
)
68 changes: 68 additions & 0 deletions Sources/MLXInferenceCore/CLICommandBuilder.swift
Original file line number Diff line number Diff line change
@@ -0,0 +1,68 @@
// CLICommandBuilder.swift — Pure function for building the equivalent CLI command
// Lives in MLXInferenceCore so it can be unit-tested by SwiftLMTests without
// requiring the SwiftBuddy app target.
import Foundation

/// Builds the equivalent `swift run SwiftLM` command string from persisted settings.
/// Only emits flags that differ from the CLI defaults, keeping the command readable.
///
/// - Parameters:
///   - config: The current `GenerationConfig`.
///   - host: The server host string (e.g. "127.0.0.1").
///   - port: The server port (e.g. 5413).
///   - parallel: Number of parallel request slots (default 1).
///   - apiKeySet: `true` if an API key is configured (key itself is redacted).
///   - modelId: The currently loaded model ID, or `nil` when no model is loaded.
/// - Returns: A multi-line shell command string suitable for display and copy.
public func buildCLICommand(
    config: GenerationConfig,
    host: String,
    port: Int,
    parallel: Int,
    apiKeySet: Bool,
    modelId: String?
) -> String {
    // Compare against the canonical defaults instead of duplicating magic
    // numbers (50, 1.05, 512, 64) here — this keeps the builder from
    // silently drifting if GenerationConfig's defaults ever change.
    let defaults = GenerationConfig.default

    var parts: [String] = []

    // Always-emitted flags: these identify the invocation even at defaults.
    parts.append("--model \(modelId ?? "<model-id>")")
    parts.append("--host \(host)")
    parts.append("--port \(port)")
    parts.append("--max-tokens \(config.maxTokens)")
    parts.append("--temp \(String(format: "%.2f", config.temperature))")

    // Conditionally-emitted flags: only shown when they differ from defaults.
    if config.topP < 1.0 {
        parts.append("--top-p \(String(format: "%.2f", config.topP))")
    }
    if config.topK != defaults.topK {
        parts.append("--top-k \(config.topK)")
    }
    if config.minP > 0 {
        parts.append("--min-p \(String(format: "%.2f", config.minP))")
    }
    if config.repetitionPenalty != defaults.repetitionPenalty {
        parts.append("--repeat-penalty \(String(format: "%.2f", config.repetitionPenalty))")
    }
    if config.prefillSize != defaults.prefillSize {
        parts.append("--prefill-size \(config.prefillSize)")
    }
    if let kvBits = config.kvBits {
        parts.append("--kv-bits \(kvBits)")
        // Group size is only meaningful when KV quantization is enabled.
        if config.kvGroupSize != defaults.kvGroupSize {
            parts.append("--kv-group-size \(config.kvGroupSize)")
        }
    }
    if config.enableThinking {
        parts.append("--thinking")
    }
    if let seed = config.seed {
        parts.append("--seed \(seed)")
    }
    if parallel > 1 {
        parts.append("--parallel \(parallel)")
    }
    if apiKeySet {
        // Never interpolate the real key into a copyable string.
        parts.append("--api-key <redacted>")
    }

    // NOTE(review): config.turboKV and config.streamExperts are never emitted
    // here — confirm whether the CLI exposes matching flags and, if so, add
    // them so the displayed command stays faithful to the app's settings.

    return "swift run SwiftLM " + parts.joined(separator: " \\\n ")
}
79 changes: 64 additions & 15 deletions Sources/MLXInferenceCore/GenerationConfig.swift
Original file line number Diff line number Diff line change
@@ -1,24 +1,29 @@
// GenerationConfig.swift — SwiftLM inference parameters
import Foundation

/// Configuration for a single generation request.
public struct GenerationConfig: Sendable {
/// Per-request generation parameters, persisted across app launches via UserDefaults.
///
/// ### Field classification
/// **Per-request** (applied on every `generate()` call — no reload needed):
/// temperature, topP, topK, minP, repetitionPenalty, seed, enableThinking,
/// prefillSize, kvBits, kvGroupSize, turboKV
///
/// **Load-time** (requires model reload to take effect):
/// streamExperts — controls SSD expert streaming for MoE and large models.
/// Stored here for persistence but applied by InferenceEngine at load time.
public struct GenerationConfig: Sendable, Codable {
public var maxTokens: Int
public var temperature: Float
public var topP: Float
public var topK: Int
public var minP: Float
public var repetitionPenalty: Float
public var seed: UInt64?
public var enableThinking: Bool

// ── SwiftLM Engine Parameters ──────────────────────────────────────
/// Enable TurboQuant KV-cache compression (3-bit PolarQuant+QJL).
/// Compresses KV history > 8192 tokens to ~3.5 bits/token.
public var turboKV: Bool
/// Optional RNG seed for reproducible outputs.
/// When non-nil, `MLX.seed(seed)` is called before each generation using this `UInt64` value.
public var seed: UInt64?

/// Enable SSD expert streaming for MoE models.
public var streamExperts: Bool
public var enableThinking: Bool

/// Chunk size for prefill evaluation.
/// Lower values prevent GPU timeout on large models.
Expand All @@ -30,6 +35,21 @@ public struct GenerationConfig: Sendable {
/// KV-cache quantization group size (default 64).
public var kvGroupSize: Int

/// Enable 3-bit TurboQuant KV-cache compression (PolarQuant+QJL).
/// Compresses KV history older than 8192 tokens to ~3.5 bits/token.
/// Recommended for 100k+ context to halve KV RAM usage.
/// Applied per-request — no model reload needed.
public var turboKV: Bool

/// Enable SSD expert streaming for MoE (and any large) models.
/// When true, expert weights are mmap'd from NVMe and only active
/// expert pages reside in RAM during inference (Flash-MoE style).
/// ⚠️ LOAD-TIME flag: changes take effect on the next model load.
/// MoE models (isMoE == true) default to true automatically;
/// this flag lets users override that for non-catalog models or
/// force-disable streaming even on MoE models.
public var streamExperts: Bool

public init(
maxTokens: Int = 2048,
temperature: Float = 0.6,
Expand All @@ -39,11 +59,11 @@ public struct GenerationConfig: Sendable {
repetitionPenalty: Float = 1.05,
seed: UInt64? = nil,
enableThinking: Bool = false,
turboKV: Bool = false,
streamExperts: Bool = false,
prefillSize: Int = 512,
kvBits: Int? = nil,
kvGroupSize: Int = 64
kvGroupSize: Int = 64,
turboKV: Bool = false,
streamExperts: Bool = false
) {
self.maxTokens = maxTokens
self.temperature = temperature
Expand All @@ -53,12 +73,41 @@ public struct GenerationConfig: Sendable {
self.repetitionPenalty = repetitionPenalty
self.seed = seed
self.enableThinking = enableThinking
self.turboKV = turboKV
self.streamExperts = streamExperts
self.prefillSize = prefillSize
self.kvBits = kvBits
self.kvGroupSize = kvGroupSize
self.turboKV = turboKV
self.streamExperts = streamExperts
}

public static let `default` = GenerationConfig()

// MARK: — Persistence

private static let storageKey = "swiftlm.generationConfig"

/// True when the user has previously saved a GenerationConfig.
/// Used to distinguish the first-run/default state from an explicit choice.
public static var hasPersistedConfig: Bool {
UserDefaults.standard.object(forKey: storageKey) != nil
}

/// Computes the effective SSD streaming setting.
/// Before the user has saved settings, MoE models default to streaming on.
/// After settings are persisted, the saved toggle becomes authoritative.
public func effectiveStreamExperts(defaultingTo defaultValue: Bool) -> Bool {
Self.hasPersistedConfig ? streamExperts : defaultValue
}

public func save() {
guard let data = try? JSONEncoder().encode(self) else { return }
UserDefaults.standard.set(data, forKey: Self.storageKey)
}

public static func load() -> GenerationConfig {
guard let data = UserDefaults.standard.data(forKey: storageKey),
let decoded = try? JSONDecoder().decode(GenerationConfig.self, from: data)
else { return .default }
return decoded
}
}
Loading
Loading