From 9274a05323191b16affa7cb11adb7b16029d694c Mon Sep 17 00:00:00 2001
From: Roy Peter D'Souza
Date: Wed, 29 Apr 2026 20:27:12 -0700
Subject: [PATCH 1/2] Local audit remediations: swap monitoring and MoE memory budgeting

---
 README.md                                     |  14 -
 Sources/DFlash/DFlashIntermediateDumper.swift |   1 +
 Sources/DFlash/DFlashRuntime.swift            |   2 +-
 .../MLXInferenceCore/GenerationConfig.swift   |  30 +-
 .../MLXInferenceCore/InferenceEngine.swift    | 186 +---
 .../ModelDownloadManager.swift                | 133 +--
 .../MLXInferenceCore/ModelDownloader.swift    | 442 +-------
 Sources/MLXInferenceCore/ModelStorage.swift   | 421 +-------
 Sources/SwiftLM/ModelProfiler.swift           |  54 +-
 Sources/SwiftLM/Server.swift                  |  71 +-
 SwiftBuddy/SwiftBuddy/SwiftBuddyApp.swift     |  10 +-
 .../SwiftBuddy/ViewModels/ServerManager.swift | 178 +---
 SwiftBuddy/SwiftBuddy/Views/ChatView.swift    |  81 +-
 .../SwiftBuddy/Views/InspectorView.swift      |  17 +-
 .../Views/ModelManagementView.swift           |  25 +-
 .../SwiftBuddy/Views/ModelPickerView.swift    |   2 +-
 SwiftBuddy/SwiftBuddy/Views/ModelsView.swift  | 115 +-
 SwiftBuddy/SwiftBuddy/Views/RootView.swift    |  51 +-
 .../SwiftBuddy/Views/SettingsView.swift       | 998 +++++-------------
 SwiftBuddy/generate_xcodeproj.py              |  28 +-
 scripts/profiling/profile_runner.py           |  14 +-
 21 files changed, 509 insertions(+), 2364 deletions(-)

diff --git a/README.md b/README.md
index 6b9d5eb1..3a8d3778 100644
--- a/README.md
+++ b/README.md
@@ -73,20 +73,6 @@
 Benchmark results for `gemma-4-26b-a4b-it-4bit` (26B MoE, 4-bit) on M5 Pro 64 GB
 
 > Run `./run_benchmark.sh` to generate these metrics on your own device. (See **Benchmarks & Testing** below).
 
-### Qwen3.6-35B-A3B-UD-MLX-4bit (Full-RAM) — M1 Ultra 64 GB
-
-Benchmark results for full-RAM (no SSD streaming) MoE inference on M1 Ultra. The 3.4× vanilla improvement vs. earlier builds comes from the `needsMoeFlush` gate in `mlx-swift-lm` (see [SwiftLM #84](https://github.com/SharpAI/SwiftLM/issues/84)) — the per-layer GPU sync barrier required for SSD streaming was firing unconditionally on the full-RAM path and flushing MLX's kernel-batching pipeline.
-
-| Configuration | Short (~126 tok) | Medium (~400 tok) | Long (~800 tok) |
-|---|---|---|---|
-| **Vanilla full-GPU** | **61.7 tok/s** | **62.3 tok/s** | **62.1 tok/s** |
-
-> *Hardware:* Apple M1 Ultra, 64 GB unified memory, macOS 26.x. Model ~20 GB on disk, ~21.6 GB resident weight + ~2.1 GB KV at runtime.
-> *Flags:* `--repeat-penalty 1.1 --max-tokens 2000`, `temperature: 0.6`, single-stream `/v1/chat/completions`.
-> *Vanilla baseline before* `needsMoeFlush` *gate (for reference):* 19.2 / 18.1 / 18.3 tok/s — see #84.
-
-> ⚠️ **DFlash on this model is currently unsuitable for production.** DFlash uses pure greedy (`argMax`) decoding regardless of `temperature`, which on Qwen3.6-35B-A3B + the [`z-lab/Qwen3.6-35B-A3B-DFlash`](https://huggingface.co/z-lab/Qwen3.6-35B-A3B-DFlash) draft locks into low-entropy attractors (`"and and and..."`, `"**UMA** **UMA**..."`). Earlier 70 tok/s DFlash numbers were degenerate output that scored high acceptance because draft and target both committed to the same locked-in token. Repetition-penalty mitigation works on some prompts but tanks acceptance on others — the proper fix is stochastic posterior sampling with rejection-based accept ([Leviathan/Chen](https://arxiv.org/abs/2211.17192) formulation), which is a DFlash architecture change tracked at [z-lab/dflash#91](https://github.com/z-lab/dflash/issues/91).
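For reference, a minimal Swift sketch of the rejection-based accept step that fix calls for ([Leviathan/Chen](https://arxiv.org/abs/2211.17192)), assuming per-token probabilities from both draft and target models are available; function and parameter names here are illustrative, not DFlash's actual API:

```swift
import Foundation

/// Accept one drafted token with probability min(1, p_target / p_draft).
/// `targetProb`/`draftProb` are illustrative names, not the DFlash API.
func acceptsDraftToken(targetProb: Double, draftProb: Double) -> Bool {
    let acceptProb = min(1.0, targetProb / max(draftProb, 1e-12))
    return Double.random(in: 0..<1) < acceptProb
}

/// On rejection, resample from the residual r(x) ∝ max(0, p_target(x) − p_draft(x)).
/// This is what preserves the target distribution exactly (greedy argMax does not).
func residualSample(targetProbs: [Double], draftProbs: [Double]) -> Int {
    let residual = zip(targetProbs, draftProbs).map { max(0.0, $0.0 - $0.1) }
    let total = residual.reduce(0, +)
    guard total > 0 else {
        // Degenerate case: draft and target agree everywhere, fall back to target argmax.
        return targetProbs.indices.max(by: { targetProbs[$0] < targetProbs[$1] }) ?? 0
    }
    var u = Double.random(in: 0..<total)
    for (index, mass) in residual.enumerated() {
        u -= mass
        if u <= 0 { return index }
    }
    return residual.count - 1
}
```

Accept-then-residual-resample reproduces the target distribution exactly, so `temperature` is honored instead of collapsing into the greedy attractors described above.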
- ### DeepSeek-V4-Flash (126 GB, Q3-mixed-gs128-affine) — M5 Pro 64 GB Model: [`Thump604/DeepSeek-V4-Flash-MLX-Q3-mixed-gs128-affine`](https://huggingface.co/Thump604/DeepSeek-V4-Flash-MLX-Q3-mixed-gs128-affine) diff --git a/Sources/DFlash/DFlashIntermediateDumper.swift b/Sources/DFlash/DFlashIntermediateDumper.swift index 1401cb9b..a9802aff 100644 --- a/Sources/DFlash/DFlashIntermediateDumper.swift +++ b/Sources/DFlash/DFlashIntermediateDumper.swift @@ -42,6 +42,7 @@ public enum DFlashDumper { eval(floatArr) let shape = (0.. 8192 tokens to ~3.5 bits/token. - public var turboKV: Bool - - /// Enable SSD expert streaming for MoE models. - public var streamExperts: Bool - - /// Chunk size for prefill evaluation. - /// Lower values prevent GPU timeout on large models. - public var prefillSize: Int - - /// KV-cache quantization bits (nil = no quantization, 4 or 8 typical). - public var kvBits: Int? - - /// KV-cache quantization group size (default 64). - public var kvGroupSize: Int - public init( maxTokens: Int = 2048, temperature: Float = 0.6, @@ -38,12 +20,7 @@ public struct GenerationConfig: Sendable { minP: Float = 0.0, repetitionPenalty: Float = 1.05, seed: UInt64? = nil, - enableThinking: Bool = false, - turboKV: Bool = false, - streamExperts: Bool = false, - prefillSize: Int = 512, - kvBits: Int? = nil, - kvGroupSize: Int = 64 + enableThinking: Bool = false ) { self.maxTokens = maxTokens self.temperature = temperature @@ -53,11 +30,6 @@ public struct GenerationConfig: Sendable { self.repetitionPenalty = repetitionPenalty self.seed = seed self.enableThinking = enableThinking - self.turboKV = turboKV - self.streamExperts = streamExperts - self.prefillSize = prefillSize - self.kvBits = kvBits - self.kvGroupSize = kvGroupSize } public static let `default` = GenerationConfig() diff --git a/Sources/MLXInferenceCore/InferenceEngine.swift b/Sources/MLXInferenceCore/InferenceEngine.swift index 38d5b396..ed1c34ae 100644 --- a/Sources/MLXInferenceCore/InferenceEngine.swift +++ b/Sources/MLXInferenceCore/InferenceEngine.swift @@ -114,10 +114,6 @@ public final class InferenceEngine: ObservableObject { @Published public private(set) var activeContextTokens: Int = 0 @Published public private(set) var maxContextWindow: Int = 0 - /// Set when a corrupted/truncated model is detected during inference. - /// The UI should observe this and offer to delete & re-download. - @Published public var corruptedModelId: String? = nil - /// Whether to automatically unload the model when the app backgrounds /// and reload it when returning to foreground. /// Defaults to true on iOS (prevents jetsam), false on macOS. @@ -281,44 +277,7 @@ public final class InferenceEngine: ObservableObject { state = .error("Device is too hot. Let it cool before loading a model.") return } - corruptedModelId = nil - - guard ModelStorage.verifyModelIntegrity(for: modelId) else { - await downloadThenLoad(modelId: modelId) - return - } - - await loadVerifiedModel(modelId: modelId) - } - - private func downloadThenLoad(modelId: String) async { - print("[InferenceEngine] Model \(modelId) is missing or incomplete. 
Starting download before load.") - releaseLoadedModelResources() - state = .downloading(progress: 0.0, speed: "Preparing...") - - let task = downloadManager.startDownload(modelId: modelId) - - do { - try await task.value - state = .downloading(progress: 1.0, speed: "Verifying...") - - guard ModelStorage.verifyModelIntegrity(for: modelId) else { - markModelCorrupted( - modelId: modelId, - message: "Model files are incomplete after download. Choose a recovery option." - ) - return - } - - await loadVerifiedModel(modelId: modelId) - } catch is CancellationError { - state = .idle - } catch { - state = .error("Failed to download \(modelId): \(error.localizedDescription)") - } - } - private func loadVerifiedModel(modelId: String) async { state = .loading currentModelId = modelId @@ -353,21 +312,17 @@ public final class InferenceEngine: ObservableObject { downloader: downloader ) - let speedTracker = DownloadSpeedTracker() - if architecture.supportsVision { container = try await VLMModelFactory.shared.loadContainer( from: downloader, using: TransformersTokenizerLoader(), configuration: config ) { [weak self] progress in - speedTracker.record(totalBytes: progress.completedUnitCount) - let smoothedSpeed = speedTracker.speedBytesPerSec - Task { @MainActor in guard let self else { return } let pct = progress.fractionCompleted - let speedStr = smoothedSpeed + let speedBytesPerSec = progress.userInfo[ProgressUserInfoKey("throughputKey")] as? Double + let speedStr = speedBytesPerSec .map { String(format: "%.1f MB/s", $0 / 1_000_000) } ?? "" self.state = .downloading(progress: pct, speed: speedStr) @@ -375,7 +330,7 @@ public final class InferenceEngine: ObservableObject { modelId: modelId, fractionCompleted: pct, currentFile: "", - speedMBps: smoothedSpeed.map { $0 / 1_000_000 } + speedMBps: speedBytesPerSec.map { $0 / 1_000_000 } )) } } @@ -385,13 +340,11 @@ public final class InferenceEngine: ObservableObject { using: TransformersTokenizerLoader(), configuration: config ) { [weak self] progress in - speedTracker.record(totalBytes: progress.completedUnitCount) - let smoothedSpeed = speedTracker.speedBytesPerSec - Task { @MainActor in guard let self else { return } let pct = progress.fractionCompleted - let speedStr = smoothedSpeed + let speedBytesPerSec = progress.userInfo[ProgressUserInfoKey("throughputKey")] as? Double + let speedStr = speedBytesPerSec .map { String(format: "%.1f MB/s", $0 / 1_000_000) } ?? "" self.state = .downloading(progress: pct, speed: speedStr) @@ -399,7 +352,7 @@ public final class InferenceEngine: ObservableObject { modelId: modelId, fractionCompleted: pct, currentFile: "", - speedMBps: smoothedSpeed.map { $0 / 1_000_000 } + speedMBps: speedBytesPerSec.map { $0 / 1_000_000 } )) } } @@ -408,85 +361,26 @@ public final class InferenceEngine: ObservableObject { downloadManager.clearProgress(modelId: modelId) downloadManager.lastLoadedModelId = modelId downloadManager.refresh() - - // Verify integrity to catch incomplete downloads before marking as ready - guard ModelStorage.verifyModelIntegrity(for: modelId) else { - throw NSError(domain: "InferenceEngine", code: 1, userInfo: [NSLocalizedDescriptionKey: "Model safetensors files are incomplete. 
Please delete and re-download."]) - } - - // Read the model's actual max context length from config.json - if let ctxLen = ModelStorage.readMaxContextLength(for: modelId) { - self.maxContextWindow = ctxLen - print("[InferenceEngine] Model context window: \(ctxLen) tokens") - } else { - self.maxContextWindow = 8192 // conservative fallback for models without explicit limits - print("[InferenceEngine] No explicit context limit found in config.json, defaulting to 8192") - } - state = .ready(modelId: modelId) } catch { ExpertStreamingConfig.shared.deactivate() downloadManager.clearProgress(modelId: modelId) state = .error("Failed to load \(modelId): \(error.localizedDescription)") - - // If the model is incomplete/corrupted, flag it so the UI shows the "Delete & Re-download" button - let nsError = error as NSError - if nsError.domain == "InferenceEngine" && nsError.code == 1 || Self.isModelCorruptionError(error) { - markModelCorrupted( - modelId: modelId, - message: "Model weights are corrupted or incomplete. Choose a recovery option." - ) - return - } - container = nil - self.maxContextWindow = 0 - self.activeContextTokens = 0 } } /// Unload the current model and free all GPU memory. public func unload() { - releaseLoadedModelResources() - corruptedModelId = nil - state = .idle - } - - private func releaseLoadedModelResources() { generationTask?.cancel() - generationTask = nil container = nil currentModelId = nil - maxContextWindow = 0 - activeContextTokens = 0 + state = .idle ExpertStreamingConfig.shared.deactivate() MLX.Memory.cacheLimit = 0 } - private func markModelCorrupted(modelId: String?, message: String) { - let failedModelId = modelId ?? currentModelId - releaseLoadedModelResources() - state = .error(message) - corruptedModelId = failedModelId - } - - private static func isModelCorruptionError(_ error: Error) -> Bool { - let description = error.localizedDescription.lowercased() - return description.contains("ssd streaming") - || description.contains("pread") - || description.contains("safetensors") - || description.contains("corrupt") - || description.contains("incomplete") - } - - public func clearCorruptionRecovery() { - corruptedModelId = nil - if case .error = state { - state = .idle - } - } - // MARK: — Generation public nonisolated func generate( @@ -528,17 +422,11 @@ public final class InferenceEngine: ObservableObject { } let mlxMessages = finalMessages - var params = GenerateParameters( - maxTokens: config.maxTokens, - kvBits: config.kvBits, - kvGroupSize: config.kvGroupSize, - temperature: config.temperature, - topP: config.topP, - topK: config.topK, - minP: config.minP, - repetitionPenalty: config.repetitionPenalty, - prefillStepSize: config.prefillSize - ) + var params = GenerateParameters(temperature: config.temperature) + params.topP = config.topP + params.topK = config.topK + params.minP = config.minP + params.repetitionPenalty = config.repetitionPenalty params.repetitionContextSize = 20 var thinkingActive = false @@ -554,7 +442,9 @@ public final class InferenceEngine: ObservableObject { let baseTokens = Int(Double(stringLength) / 3.5) self.activeContextTokens = baseTokens - // maxContextWindow is already set during loadModel() from config.json + // If we have a max length config, expose it + // TODO: Safely extract from ModelConfiguration when MLX exposes it dynamically + self.maxContextWindow = 8192 let stream: AsyncStream = try await container.generate( input: lmInput, @@ -595,30 +485,11 @@ public final class InferenceEngine: ObservableObject { 
continuation.yield(GenerationToken(text: text, isThinking: thinkingActive)) } } - } catch let ssdError as SSDStreamingError { - // Corrupted/truncated safetensors — surface a clear, actionable error - let msg = "Model weights are corrupted or incomplete. Please re-download the model." - print("[InferenceEngine] SSD Streaming Error: \(ssdError.localizedDescription)") - continuation.yield(GenerationToken(text: "\n\n[Error: \(msg)]")) - self.markModelCorrupted(modelId: self.currentModelId, message: msg) } catch { - // Check if the generic error is also an SSD streaming issue - if Self.isModelCorruptionError(error) { - let msg = "Model weights are corrupted or incomplete. Please re-download the model." - self.markModelCorrupted(modelId: self.currentModelId, message: msg) - } continuation.yield(GenerationToken(text: "\n\n[Error: \(error.localizedDescription)]")) } - if let latchedError = SSDStreamingErrorLatch.shared.consume() { - let msg = "Model weights are corrupted or incomplete. Please re-download the model." - print("[InferenceEngine] Latched SSD error after generation: \(latchedError.localizedDescription)") - self.markModelCorrupted(modelId: self.currentModelId, message: msg) - } else if case .error = self.state { - // Already in error state from catch block above - } else { - self.state = self.currentModelId.map { .ready(modelId: $0) } ?? .idle - } + self.state = self.currentModelId.map { .ready(modelId: $0) } ?? .idle continuation.finish() } } @@ -629,29 +500,4 @@ public final class InferenceEngine: ObservableObject { generationTask = nil if let id = currentModelId { state = .ready(modelId: id) } } - - /// Delete corrupted model files and start a fresh download. - /// Called from the UI when the user confirms re-download after corruption is detected. - public func deleteCorruptedAndRedownload() { - guard let modelId = corruptedModelId else { return } - - releaseLoadedModelResources() - state = .downloading(progress: 0.0, speed: "Deleting corrupted files...") - - do { - try ModelStorage.delete(modelId) - print("[InferenceEngine] Successfully deleted corrupted cache directory for \(modelId).") - } catch { - print("[InferenceEngine] FAILED to delete corrupted cache: \(error.localizedDescription)") - state = .error("Failed to delete corrupted model: \(error.localizedDescription)") - return - } - downloadManager.refresh() - corruptedModelId = nil - - print("[InferenceEngine] Deleted corrupted files for \(modelId), starting fresh download") - Task { @MainActor in - await downloadThenLoad(modelId: modelId) - } - } } diff --git a/Sources/MLXInferenceCore/ModelDownloadManager.swift b/Sources/MLXInferenceCore/ModelDownloadManager.swift index 309bf4c6..5a0c8b07 100644 --- a/Sources/MLXInferenceCore/ModelDownloadManager.swift +++ b/Sources/MLXInferenceCore/ModelDownloadManager.swift @@ -52,13 +52,10 @@ public final class ModelDownloadManager: ObservableObject { // MARK: Published state @Published public private(set) var downloadedModels: [DownloadedModel] = [] - @Published public private(set) var incompleteDownloads: [ModelStorage.IncompleteDownload] = [] @Published public private(set) var activeDownloads: [String: ModelDownloadProgress] = [:] @Published public private(set) var totalDiskUsageBytes: Int64 = 0 @Published public private(set) var networkStatus: NetworkStatus = .unknown - private var downloadedModelIDs: Set = [] - // MARK: Persistence private let lastModelKey = "swiftlm.lastLoadedModelId" public var lastLoadedModelId: String? 
{ @@ -133,86 +130,41 @@ public final class ModelDownloadManager: ObservableObject { modifiedDate: s.modifiedDate ) } - downloadedModelIDs = Set(downloadedModels.map(\.id)) totalDiskUsageBytes = downloadedModels.reduce(0) { $0 + $1.sizeBytes } - - // Scan for interrupted downloads that can be resumed - incompleteDownloads = ModelStorage.scanIncompleteDownloads() - .filter { incomplete in - // Exclude models that are already actively downloading - !activeDownloads.keys.contains(incomplete.id) - } } public func isDownloaded(_ modelId: String) -> Bool { - downloadedModelIDs.contains(modelId) - } - - /// True if a model has a partial download that can be resumed. - public func hasIncompleteDownload(_ modelId: String) -> Bool { - incompleteDownloads.contains { $0.id == modelId } + ModelStorage.isDownloaded(modelId) } public func downloadedModel(for modelId: String) -> DownloadedModel? { downloadedModels.first(where: { $0.id == modelId }) } - /// Delete a model and free disk space (including any partial downloads). + /// Delete a model and free disk space. public func delete(_ modelId: String) throws { try ModelStorage.delete(modelId) refresh() if lastLoadedModelId == modelId { lastLoadedModelId = nil } } - /// Resume an incomplete download. This calls startDownload() which will - /// automatically resume from where it left off (partial files + HTTP Range). - @discardableResult - public func resumeDownload(modelId: String) -> Task { - return startDownload(modelId: modelId) - } - // MARK: — Download - /// Start downloading a model. - /// iOS: Uses ModelDownloader with per-file resume + retry. - /// macOS: Uses HubApi.snapshot() which handles resume internally; we add retry around it. + /// Start downloading a model (iOS only — macOS goes through LLMModelFactory in InferenceEngine.load()). @discardableResult - public func startDownload( - modelId: String, - retryConfig: DownloadRetryConfig = .default - ) -> Task { - print("[ModelDownloadManager] startDownload called for \(modelId)") + public func startDownload(modelId: String) -> Task { downloadTasks[modelId]?.cancel() let task = Task { - print("[ModelDownloadManager] Task started for \(modelId)") - // Instantly register 0% progress so UI banners appear immediately - // before the Hub API computes the file snapshot. - Task { @MainActor [weak self] in - if self?.activeDownloads[modelId] == nil { - print("[ModelDownloadManager] Registering 0% progress for \(modelId)") - self?.activeDownloads[modelId] = ModelDownloadProgress( - modelId: modelId, - fractionCompleted: 0.0, - currentFile: "Preparing download...", - speedMBps: nil - ) - } - } - do { defer { - print("[ModelDownloadManager] Defer executing, removing activeDownload for \(modelId)") Task { @MainActor [weak self] in self?.activeDownloads.removeValue(forKey: modelId) } } #if !os(macOS) - try await ModelDownloader.shared.download( - modelId: modelId, - retryConfig: retryConfig - ) { [weak self] fp in + try await ModelDownloader.shared.download(modelId: modelId) { [weak self] fp in Task { @MainActor [weak self] in self?.activeDownloads[modelId] = ModelDownloadProgress( modelId: modelId, @@ -223,68 +175,23 @@ public final class ModelDownloadManager: ObservableObject { } } #else - // macOS: HubApi.snapshot() already supports resume via incomplete blob - // files and HTTP Range headers. We add retry for transient failures. - let speedTracker = DownloadSpeedTracker() - var lastError: Error? 
- for attempt in 0...retryConfig.maxRetries { - do { - if attempt > 0 { - let delay = retryConfig.delay(for: attempt - 1) - print("[ModelDownloadManager] Retry \(attempt)/\(retryConfig.maxRetries) for \(modelId) after \(String(format: "%.1f", delay))s") - try await Task.sleep(nanoseconds: UInt64(delay * 1_000_000_000)) - try Task.checkCancellation() - speedTracker.reset() + let hub = HubApi(downloadBase: ModelStorage.cacheRoot) + _ = try await hub.snapshot( + from: modelId, + matching: ["*.safetensors", "*.json", "*.model", "*.txt", "*.tiktoken"], + progressHandler: { @Sendable [weak self] progress in + Task { @MainActor [weak self] in + let pct = progress.fractionCompleted + let speedBytesPerSec = progress.userInfo[ProgressUserInfoKey("throughputKey")] as? Double + self?.activeDownloads[modelId] = ModelDownloadProgress( + modelId: modelId, + fractionCompleted: pct, + currentFile: "", + speedMBps: speedBytesPerSec.map { $0 / 1_000_000 } + ) } - - let hub = HubApi(downloadBase: ModelStorage.cacheRoot) - print("[ModelDownloadManager] Calling hub.snapshot for \(modelId)") - _ = try await hub.snapshot( - from: modelId, - matching: ["*.safetensors", "*.json", "*.model", "*.txt", "*.tiktoken"], - progressHandler: { @Sendable [weak self] progress in - // Feed cumulative bytes into the EWMA tracker - speedTracker.record(totalBytes: progress.completedUnitCount) - let smoothedSpeed = speedTracker.speedBytesPerSec - - Task { @MainActor [weak self] in - let pct = progress.fractionCompleted - self?.activeDownloads[modelId] = ModelDownloadProgress( - modelId: modelId, - fractionCompleted: pct, - currentFile: attempt > 0 ? "(retry \(attempt))" : "", - speedMBps: smoothedSpeed.map { $0 / 1_000_000 } - ) - } - } - ) - print("[ModelDownloadManager] hub.snapshot FINISHED SUCCESSFULLY for \(modelId)") - lastError = nil - break // Success - } catch is CancellationError { - print("[ModelDownloadManager] Task was CANCELLED for \(modelId)") - throw CancellationError() - } catch { - lastError = error - print("[ModelDownloadManager] Download failed for \(modelId): \(error.localizedDescription)") - // Only retry transient network errors - if let urlError = error as? URLError { - switch urlError.code { - case .cancelled, .userCancelledAuthentication: - throw error - case .notConnectedToInternet, .networkConnectionLost, - .timedOut, .cannotConnectToHost, .dnsLookupFailed: - continue - default: - if attempt >= retryConfig.maxRetries { throw error } - continue - } - } - // Non-URLError (e.g. auth failure) — don't retry - throw error } - } - if let error = lastError { throw error } + ) #endif Task { @MainActor [weak self] in diff --git a/Sources/MLXInferenceCore/ModelDownloader.swift b/Sources/MLXInferenceCore/ModelDownloader.swift index f8652b9f..bb6ca2c9 100644 --- a/Sources/MLXInferenceCore/ModelDownloader.swift +++ b/Sources/MLXInferenceCore/ModelDownloader.swift @@ -4,12 +4,8 @@ // (called directly from InferenceEngine.load — no separate downloader needed) // // iOS: Uses HuggingFace API to enumerate model files, then downloads -// each file via URLSession to ModelStorage.cacheRoot +// each file via URLSession background session to ModelStorage.cacheRoot // so LLMModelFactory can find them on next load without re-downloading. 
-// -// Both paths support: -// • Resume after restart (partial files preserved + HTTP Range) -// • Automatic retry with exponential backoff import Foundation import Hub @@ -25,7 +21,6 @@ public struct DownloadFileProgress: Sendable { public let fileFractionCompleted: Double public let totalBytesDownloaded: Int64 public let speedBytesPerSec: Double? - public let retryAttempt: Int public var overallFraction: Double { let fileDone = Double(max(fileIndex - 1, 0)) / Double(max(fileCount, 1)) @@ -39,141 +34,6 @@ public struct DownloadFileProgress: Sendable { ? String(format: "%.1f MB/s", s / 1_000_000) : String(format: "%.0f KB/s", s / 1_000) } - - public init( - modelId: String, - fileName: String, - fileIndex: Int, - fileCount: Int, - fileFractionCompleted: Double, - totalBytesDownloaded: Int64, - speedBytesPerSec: Double?, - retryAttempt: Int = 0 - ) { - self.modelId = modelId - self.fileName = fileName - self.fileIndex = fileIndex - self.fileCount = fileCount - self.fileFractionCompleted = fileFractionCompleted - self.totalBytesDownloaded = totalBytesDownloaded - self.speedBytesPerSec = speedBytesPerSec - self.retryAttempt = retryAttempt - } -} - -// MARK: — Retry Configuration - -public struct DownloadRetryConfig: Sendable { - /// Maximum number of retry attempts per file (0 = no retry) - public let maxRetries: Int - /// Initial delay before the first retry (doubles each attempt) - public let initialDelaySeconds: Double - /// Maximum delay cap to prevent extremely long waits - public let maxDelaySeconds: Double - - public static let `default` = DownloadRetryConfig( - maxRetries: 3, - initialDelaySeconds: 2.0, - maxDelaySeconds: 30.0 - ) - - /// Calculate delay for a given attempt (exponential backoff with jitter) - func delay(for attempt: Int) -> TimeInterval { - let base = initialDelaySeconds * pow(2.0, Double(attempt)) - let capped = min(base, maxDelaySeconds) - // Add ±25% jitter to prevent thundering herd - let jitter = capped * Double.random(in: -0.25...0.25) - return max(0.5, capped + jitter) - } -} - -// MARK: — Speed Tracker - -/// Tracks download speed using an exponentially weighted moving average (EWMA) -/// over a sliding window. Produces stable, human-readable speed values instead -/// of volatile per-chunk calculations. -public final class DownloadSpeedTracker: @unchecked Sendable { - private let lock = NSLock() - - /// Samples: (timestamp, cumulativeBytes) - private var samples: [(time: TimeInterval, bytes: Int64)] = [] - /// How far back (seconds) to look for the rolling average - private let windowSeconds: TimeInterval - /// EWMA smoothing factor (0.0–1.0). Higher = more responsive, lower = smoother. - private let alpha: Double - /// Current EWMA speed in bytes/sec - private var ewmaSpeed: Double = 0 - /// Absolute start time for elapsed calculation - private let startTime: TimeInterval - /// Total bytes at start (for resumed downloads) - private let startBytes: Int64 - - public init(windowSeconds: TimeInterval = 5.0, alpha: Double = 0.3, resumeOffset: Int64 = 0) { - self.windowSeconds = windowSeconds - self.alpha = alpha - self.startTime = ProcessInfo.processInfo.systemUptime - self.startBytes = resumeOffset - } - - /// Record a cumulative byte count at the current time. 
- public func record(totalBytes: Int64) { - lock.lock() - defer { lock.unlock() } - - let now = ProcessInfo.processInfo.systemUptime - samples.append((time: now, bytes: totalBytes)) - - // Prune samples older than the window - let cutoff = now - windowSeconds - samples.removeAll { $0.time < cutoff } - - // Calculate instantaneous speed from oldest sample in window - if samples.count >= 2, let oldest = samples.first { - let dt = now - oldest.time - if dt > 0.1 { - let instantSpeed = Double(totalBytes - oldest.bytes) / dt - // EWMA blend - if ewmaSpeed == 0 { - ewmaSpeed = instantSpeed - } else { - ewmaSpeed = alpha * instantSpeed + (1 - alpha) * ewmaSpeed - } - } - } - } - - /// Current smoothed speed in bytes/sec. Returns nil if no meaningful data yet. - public var speedBytesPerSec: Double? { - lock.lock() - defer { lock.unlock() } - return ewmaSpeed > 0 ? ewmaSpeed : nil - } - - /// Overall average speed since tracking began (bytes/sec). - public func overallAverageSpeed(currentBytes: Int64) -> Double? { - let elapsed = ProcessInfo.processInfo.systemUptime - startTime - guard elapsed > 0.5 else { return nil } - let downloaded = currentBytes - startBytes - return downloaded > 0 ? Double(downloaded) / elapsed : nil - } - - /// Reset for a new file while keeping the tracker alive. - public func reset(resumeOffset: Int64 = 0) { - lock.lock() - defer { lock.unlock() } - samples.removeAll() - ewmaSpeed = 0 - } -} - -private struct DownloadedFileSizeMismatchError: LocalizedError { - let fileName: String - let expectedSize: Int64 - let actualSize: Int64 - - var errorDescription: String? { - "Downloaded file \(fileName) is incomplete (expected \(expectedSize) bytes, got \(actualSize))." - } } // MARK: — Downloader actor @@ -183,16 +43,18 @@ public actor ModelDownloader { public static let shared = ModelDownloader() private init() {} - // MARK: — iOS: URLSession download with resume + retry + // MARK: — iOS: URLSession background download #if !os(macOS) - private lazy var session: URLSession = { - let config = URLSessionConfiguration.default + private lazy var backgroundSession: URLSession = { + let config = URLSessionConfiguration.background( + withIdentifier: "com.sharpai.swiftlmchat.modeldownload" + ) + config.isDiscretionary = false + config.sessionSendsLaunchEvents = true config.allowsConstrainedNetworkAccess = true config.allowsExpensiveNetworkAccess = true - config.timeoutIntervalForRequest = 60 - config.timeoutIntervalForResource = 3600 // 1 hour for large files return URLSession(configuration: config) }() @@ -201,12 +63,11 @@ public actor ModelDownloader { let siblings: [HFFile] struct HFFile: Decodable { let rfilename: String - let size: Int64? } } /// Fetch the file list for a model from the HuggingFace REST API. - private func fetchFileList(modelId: String) async throws -> [(name: String, size: Int64?)] { + private func fetchFileList(modelId: String) async throws -> [String] { let url = URL(string: "https://huggingface.co/api/models/\(modelId)")! let (data, response) = try await URLSession.shared.data(from: url) guard let http = response as? 
HTTPURLResponse, http.statusCode == 200 else { @@ -214,211 +75,37 @@ public actor ModelDownloader { } let info = try JSONDecoder().decode(HFModelInfo.self, from: data) return info.siblings + .map { $0.rfilename } .filter { name in - !name.rfilename.hasSuffix(".bin") // Skip PyTorch weights - && !name.rfilename.hasSuffix(".ot") - && !name.rfilename.contains(".gguf") + !name.hasSuffix(".bin") // Skip PyTorch weights + && !name.hasSuffix(".ot") + && !name.contains(".gguf") } - .map { ($0.rfilename, $0.size) } } - /// Download a single file from HuggingFace to `targetDir` with resume support. - /// - /// Uses a `.incomplete` suffix for in-progress downloads. If a partial file - /// exists from a previous attempt, sends an HTTP Range header to resume. - private func downloadFile( - modelId: String, - fileName: String, - expectedSize: Int64?, - targetDir: URL, - speedTracker: DownloadSpeedTracker, - onProgress: @Sendable (Double, Double?) -> Void - ) async throws { + /// Download a single file from HuggingFace to `targetDir`. + private func downloadFile(modelId: String, fileName: String, targetDir: URL) async throws { let fileURL = URL(string: "https://huggingface.co/\(modelId)/resolve/main/\(fileName)")! let destURL = targetDir.appendingPathComponent(fileName) - let incompleteURL = destURL.appendingPathExtension("incomplete") // Create subdirectories if needed (e.g. for tokenizer/config subpaths) let parentDir = destURL.deletingLastPathComponent() try FileManager.default.createDirectory(at: parentDir, withIntermediateDirectories: true) - // Already downloaded — verify size if known, skip if good - if FileManager.default.fileExists(atPath: destURL.path) { - if let expected = expectedSize { - let actual = (try? FileManager.default.attributesOfItem(atPath: destURL.path)[.size] as? Int64) ?? 0 - if actual == expected { - onProgress(1.0, speedTracker.speedBytesPerSec) - return - } - // Size mismatch — remove and re-download - try? FileManager.default.removeItem(at: destURL) - } else { - onProgress(1.0, speedTracker.speedBytesPerSec) - return - } - } + if FileManager.default.fileExists(atPath: destURL.path) { return } - // Check for a partial download from a previous session - var resumeOffset: Int64 = 0 - if FileManager.default.fileExists(atPath: incompleteURL.path) { - resumeOffset = (try? FileManager.default.attributesOfItem(atPath: incompleteURL.path)[.size] as? Int64) ?? 0 - } - - var request = URLRequest(url: fileURL) - if resumeOffset > 0 { - request.setValue("bytes=\(resumeOffset)-", forHTTPHeaderField: "Range") - } - - // Stream download using bytes(for:) for progress tracking - let (asyncBytes, response) = try await session.bytes(for: request) - guard let http = response as? HTTPURLResponse else { - throw URLError(.badServerResponse) - } - - // Handle 416 Range Not Satisfiable — partial file is stale, restart - if http.statusCode == 416 { - try? FileManager.default.removeItem(at: incompleteURL) - resumeOffset = 0 - speedTracker.reset() - // Retry without Range header - let freshRequest = URLRequest(url: fileURL) - let (freshBytes, freshResponse) = try await session.bytes(for: freshRequest) - guard let freshHttp = freshResponse as? HTTPURLResponse, (200..<300).contains(freshHttp.statusCode) else { - throw URLError(.badServerResponse) - } - let totalSize = freshHttp.expectedContentLength > 0 ? freshHttp.expectedContentLength : (expectedSize ?? 
0) - let writtenSize = try await streamToFile( - asyncBytes: freshBytes, - destURL: incompleteURL, - resumeOffset: 0, - totalSize: totalSize, - speedTracker: speedTracker, - onProgress: onProgress - ) - try validateCompletedDownloadSize( - fileName: fileName, - actualSize: writtenSize, - expectedSize: totalSize > 0 ? totalSize : expectedSize - ) - } else if (200..<300).contains(http.statusCode) { - // 200 = full content (server ignored Range), 206 = partial content (resume worked) - let isResume = (http.statusCode == 206) - if !isResume { - // Server returned full content — discard partial file - try? FileManager.default.removeItem(at: incompleteURL) - resumeOffset = 0 - speedTracker.reset() - } - let totalSize: Int64 - if isResume { - totalSize = resumeOffset + http.expectedContentLength - } else { - totalSize = http.expectedContentLength > 0 ? http.expectedContentLength : (expectedSize ?? 0) - } - let writtenSize = try await streamToFile( - asyncBytes: asyncBytes, - destURL: incompleteURL, - resumeOffset: isResume ? resumeOffset : 0, - totalSize: totalSize, - speedTracker: speedTracker, - onProgress: onProgress - ) - try validateCompletedDownloadSize( - fileName: fileName, - actualSize: writtenSize, - expectedSize: totalSize > 0 ? totalSize : expectedSize - ) - } else { + let (tmpURL, response) = try await backgroundSession.download(from: fileURL) + guard let http = response as? HTTPURLResponse, http.statusCode == 200 else { throw URLError(.badServerResponse) } - - // Atomic move from .incomplete to final destination try? FileManager.default.removeItem(at: destURL) - try FileManager.default.moveItem(at: incompleteURL, to: destURL) - onProgress(1.0, speedTracker.speedBytesPerSec) - } - - /// Stream async bytes to a file, appending if resuming. - private func streamToFile( - asyncBytes: URLSession.AsyncBytes, - destURL: URL, - resumeOffset: Int64, - totalSize: Int64, - speedTracker: DownloadSpeedTracker, - onProgress: @Sendable (Double, Double?) -> Void - ) async throws -> Int64 { - let fileHandle: FileHandle - if resumeOffset > 0, FileManager.default.fileExists(atPath: destURL.path) { - fileHandle = try FileHandle(forWritingTo: destURL) - fileHandle.seekToEndOfFile() - } else { - FileManager.default.createFile(atPath: destURL.path, contents: nil) - fileHandle = try FileHandle(forWritingTo: destURL) - } - defer { try? fileHandle.close() } - - let flushSize = 256 * 1024 // Flush every 256 KB - var lastProgressUpdate = Date() - var bytesWritten: Int64 = resumeOffset - var iterator = asyncBytes.makeAsyncIterator() - var chunkBuffer = [UInt8](repeating: 0, count: flushSize) - - while true { - try Task.checkCancellation() - - var chunkCount = 0 - while chunkCount < chunkBuffer.count, - let byte = try await iterator.next() { - chunkBuffer[chunkCount] = byte - chunkCount += 1 - } - - if chunkCount == 0 { - break - } - - fileHandle.write(Data(chunkBuffer[0..= 0.1 { - lastProgressUpdate = now - if totalSize > 0 { - onProgress(Double(bytesWritten) / Double(totalSize), speedTracker.speedBytesPerSec) - } - } - } - - if totalSize > 0 { - onProgress(Double(bytesWritten) / Double(totalSize), speedTracker.speedBytesPerSec) - } - return bytesWritten - } - - private func validateCompletedDownloadSize( - fileName: String, - actualSize: Int64, - expectedSize: Int64? 
- ) throws { - guard let expectedSize, expectedSize > 0, actualSize != expectedSize else { return } - throw DownloadedFileSizeMismatchError( - fileName: fileName, - expectedSize: expectedSize, - actualSize: actualSize - ) + try FileManager.default.moveItem(at: tmpURL, to: destURL) } /// Download all model files to `ModelStorage.cacheRoot` in the Hugging Face /// hub format expected by `LLMModelFactory.loadContainer()`. - /// - /// Supports: - /// - **Resume after restart**: partial `.incomplete` files are preserved and resumed via HTTP Range - /// - **Automatic retry**: transient network errors retry with exponential backoff public func download( modelId: String, - retryConfig: DownloadRetryConfig = .default, onProgress: @escaping @Sendable (DownloadFileProgress) -> Void ) async throws { let files = try await fetchFileList(modelId: modelId) @@ -435,98 +122,39 @@ public actor ModelDownloader { ) var totalDownloaded: Int64 = 0 - let speedTracker = DownloadSpeedTracker() - for (idx, file) in files.enumerated() { + for (idx, fileName) in files.enumerated() { try Task.checkCancellation() + let startTime = Date() let before = ModelStorage.directorySize(at: snapshotDir) - var lastError: Error? - - // Reset speed tracker per-file so the EWMA starts fresh - speedTracker.reset() - - // Retry loop with exponential backoff - for attempt in 0...retryConfig.maxRetries { - do { - if attempt > 0 { - let delay = retryConfig.delay(for: attempt - 1) - print("[ModelDownloader] Retry \(attempt)/\(retryConfig.maxRetries) for \(file.name) after \(String(format: "%.1f", delay))s") - try await Task.sleep(nanoseconds: UInt64(delay * 1_000_000_000)) - try Task.checkCancellation() - } - - onProgress(DownloadFileProgress( - modelId: modelId, - fileName: file.name, - fileIndex: idx + 1, - fileCount: files.count, - fileFractionCompleted: 0, - totalBytesDownloaded: totalDownloaded, - speedBytesPerSec: speedTracker.speedBytesPerSec, - retryAttempt: attempt - )) - - try await downloadFile( - modelId: modelId, - fileName: file.name, - expectedSize: file.size, - targetDir: snapshotDir, - speedTracker: speedTracker - ) { fraction, speed in - onProgress(DownloadFileProgress( - modelId: modelId, - fileName: file.name, - fileIndex: idx + 1, - fileCount: files.count, - fileFractionCompleted: fraction, - totalBytesDownloaded: totalDownloaded, - speedBytesPerSec: speed, - retryAttempt: attempt - )) - } - - lastError = nil - break // Success — exit retry loop - } catch is CancellationError { - throw CancellationError() - } catch { - lastError = error - print("[ModelDownloader] Download failed for \(file.name): \(error.localizedDescription)") - // Don't retry on non-transient errors - if let urlError = error as? 
URLError { - switch urlError.code { - case .cancelled, .userCancelledAuthentication: - throw error - case .notConnectedToInternet, .networkConnectionLost, - .timedOut, .cannotConnectToHost, .dnsLookupFailed: - continue // Transient — retry - default: - if attempt >= retryConfig.maxRetries { throw error } - continue - } - } - } - } + onProgress(DownloadFileProgress( + modelId: modelId, + fileName: fileName, + fileIndex: idx + 1, + fileCount: files.count, + fileFractionCompleted: 0, + totalBytesDownloaded: totalDownloaded, + speedBytesPerSec: nil + )) - if let error = lastError { - throw error - } + try await downloadFile(modelId: modelId, fileName: fileName, targetDir: snapshotDir) let after = ModelStorage.directorySize(at: snapshotDir) let downloaded = max(0, after - before) totalDownloaded += downloaded + let elapsed = max(Date().timeIntervalSince(startTime), 0.001) + let speed = Double(downloaded) / elapsed onProgress(DownloadFileProgress( modelId: modelId, - fileName: file.name, + fileName: fileName, fileIndex: idx + 1, fileCount: files.count, fileFractionCompleted: 1.0, totalBytesDownloaded: totalDownloaded, - speedBytesPerSec: speedTracker.speedBytesPerSec, - retryAttempt: 0 + speedBytesPerSec: speed )) } } diff --git a/Sources/MLXInferenceCore/ModelStorage.swift b/Sources/MLXInferenceCore/ModelStorage.swift index ddd1e507..758e36be 100644 --- a/Sources/MLXInferenceCore/ModelStorage.swift +++ b/Sources/MLXInferenceCore/ModelStorage.swift @@ -43,216 +43,24 @@ public enum ModelStorage { /// Local cache directory for a model, or nil if not downloaded. public static func cacheDirectory(for modelId: String) -> URL? { - materializedDirectory(for: modelId) ?? hubCacheDirectory(for: modelId) - } - - /// Swift Hub's materialized repository directory. - /// `HubApi(downloadBase: cacheRoot).snapshot(from:)` writes here. - public static func materializedDirectory(for modelId: String) -> URL? { - let dir = materializedDirectoryURL(for: modelId) - return directoryExists(dir) ? dir : nil - } - - private static func materializedDirectoryURL(for modelId: String) -> URL { - cacheRoot - .appendingPathComponent("models", isDirectory: true) - .appendingPathComponent(modelId, isDirectory: true) - } - - /// Hugging Face hub cache directory used by Python tools and older SwiftBuddy paths. - public static func hubCacheDirectory(for modelId: String) -> URL? { - let dir = hubCacheDirectoryURL(for: modelId) - return directoryExists(dir) ? dir : nil - } - - private static func hubCacheDirectoryURL(for modelId: String) -> URL { - cacheRoot.appendingPathComponent(hubDirName(for: modelId), isDirectory: true) - } - - private static func directoryExists(_ url: URL) -> Bool { - var isDirectory: ObjCBool = false - return FileManager.default.fileExists(atPath: url.path, isDirectory: &isDirectory) && isDirectory.boolValue + let dir = cacheRoot.appendingPathComponent(hubDirName(for: modelId)) + return FileManager.default.fileExists(atPath: dir.path) ? dir : nil } /// True if a model's cache directory exists and contains files. // The snapshot directory is where safetensors files live inside the HF hub layout: // /models--org--name/snapshots/main/ public static func snapshotDirectory(for modelId: String) -> URL { - return materializedDirectory(for: modelId) ?? resolvedSnapshotDirectory(for: modelId) ?? cacheRoot + return cacheRoot .appendingPathComponent(hubDirName(for: modelId)) .appendingPathComponent("snapshots/main") } - /// Resolve the active snapshot directory for a model in the Hugging Face hub cache. 
- /// Prefer refs/main because snapshot directories are usually commit hashes, not "main". - public static func resolvedSnapshotDirectory(for modelId: String) -> URL? { - guard let dir = hubCacheDirectory(for: modelId) else { return nil } - - let snapshotsDir = dir.appendingPathComponent("snapshots", isDirectory: true) - guard FileManager.default.fileExists(atPath: snapshotsDir.path) else { return nil } - - let refsMain = dir.appendingPathComponent("refs/main") - if let hashString = try? String(contentsOf: refsMain, encoding: .utf8) - .trimmingCharacters(in: .whitespacesAndNewlines), - !hashString.isEmpty { - let snapshot = snapshotsDir.appendingPathComponent(hashString, isDirectory: true) - if FileManager.default.fileExists(atPath: snapshot.path) { - return snapshot - } - } - - let mainSnapshot = snapshotsDir.appendingPathComponent("main", isDirectory: true) - if FileManager.default.fileExists(atPath: mainSnapshot.path) { - return mainSnapshot - } - - guard let contents = try? FileManager.default.contentsOfDirectory( - at: snapshotsDir, - includingPropertiesForKeys: [.isDirectoryKey], - options: [.skipsHiddenFiles] - ) else { return nil } - - let directories = contents.filter { url in - (try? url.resourceValues(forKeys: [.isDirectoryKey]).isDirectory) == true - } - return directories.count == 1 ? directories[0] : nil - } - public static func isDownloaded(_ modelId: String) -> Bool { - verifyModelIntegrity(for: modelId, logFailures: false) - } - - // MARK: — Model Config Inspection - - /// Read the model's maximum context length from its config.json. - /// Checks `text_config.max_position_embeddings` first (VLM/MoE models), - /// then falls back to top-level `max_position_embeddings`. - public static func readMaxContextLength(for modelId: String) -> Int? { - guard let config = readModelConfig(for: modelId) else { return nil } - - // VLM/MoE models nest the context length in text_config - if let textConfig = config["text_config"] as? [String: Any], - let maxPos = textConfig["max_position_embeddings"] as? Int { - return maxPos - } - - // Standard LLMs have it at top level - if let maxPos = config["max_position_embeddings"] as? Int { - return maxPos - } - - // Fallback: some models use n_ctx or max_seq_len - if let nCtx = config["n_ctx"] as? Int { return nCtx } - if let maxSeq = config["max_seq_len"] as? Int { return maxSeq } - - return nil - } - - /// Read the raw config.json dictionary for a downloaded model. - /// Verifies that all required safetensors files are present in the snapshot directory. - /// This prevents the engine from entering `.ready` state if a download was interrupted or corrupted. 
- public static func verifyModelIntegrity(for modelId: String) -> Bool { - verifyModelIntegrity(for: modelId, logFailures: true) - } - - private static func verifyModelIntegrity(for modelId: String, logFailures: Bool) -> Bool { - if hasIncompleteFiles(for: modelId) { - if logFailures { print("[ModelStorage] Integrity Check Failed: Incomplete download files remain for \(modelId)") } - return false - } - - for directory in modelContentDirectories(for: modelId) { - if validateModelFiles(in: directory, logFailures: logFailures) { - return true - } - } - - if logFailures { print("[ModelStorage] Integrity Check Failed: No valid model files found for \(modelId)") } - return false - } - - private static func modelContentDirectories(for modelId: String) -> [URL] { - var directories: [URL] = [] - if let materialized = materializedDirectory(for: modelId) { - directories.append(materialized) - } - if let snapshot = resolvedSnapshotDirectory(for: modelId), !directories.contains(snapshot) { - directories.append(snapshot) - } - return directories - } - - private static func validateModelFiles(in snapshotDir: URL, logFailures: Bool) -> Bool { - // 0. Verify core metadata files - let requiredJsonFiles = ["config.json", "tokenizer.json"] - for file in requiredJsonFiles { - let path = snapshotDir.appendingPathComponent(file) - if !FileManager.default.fileExists(atPath: path.path) { - // Some models might not have tokenizer.json if they use tokenizer.model, so we only strictly enforce config.json - if file == "config.json" { - if logFailures { print("[ModelStorage] Integrity Check Failed: Missing \(file) in \(snapshotDir.path)") } - return false - } - } else if fileSizeResolvingSymlink(path) == 0 { - if logFailures { print("[ModelStorage] Integrity Check Failed: \(file) is corrupted (0 bytes)") } - return false - } - } - - // 1. Try to read model.safetensors.index.json - let indexJsonPath = snapshotDir.appendingPathComponent("model.safetensors.index.json") - if FileManager.default.fileExists(atPath: indexJsonPath.path) { - guard let data = try? Data(contentsOf: indexJsonPath), - let json = try? JSONSerialization.jsonObject(with: data) as? [String: Any], - let weightMap = json["weight_map"] as? [String: String] else { - return false - } - // Collect all unique safetensors filenames - let requiredFiles = Set(weightMap.values) - var totalShardBytes: Int64 = 0 - for file in requiredFiles { - let filePath = snapshotDir.appendingPathComponent(file) - guard let size = fileSizeResolvingSymlink(filePath) else { - if logFailures { print("[ModelStorage] Integrity Check Failed: Missing \(file)") } - return false - } - guard size > 1024 else { - if logFailures { print("[ModelStorage] Integrity Check Failed: \(file) is too small (\(size) bytes)") } - return false - } - totalShardBytes += size - } - - if let metadata = json["metadata"] as? [String: Any], - let expectedTensorBytes = int64Value(metadata["total_size"]), - totalShardBytes < expectedTensorBytes { - if logFailures { - print("[ModelStorage] Integrity Check Failed: shard bytes \(totalShardBytes) below index total_size \(expectedTensorBytes)") - } - return false - } - return true - } - - // 2. 
If no index.json, it might be a single safetensors file model - let singleSafetensors = snapshotDir.appendingPathComponent("model.safetensors") - if let size = fileSizeResolvingSymlink(singleSafetensors), size > 1024 { - return true - } - - if logFailures { print("[ModelStorage] Integrity Check Failed: No safetensors found in \(snapshotDir.path)") } - return false - } - - public static func readModelConfig(for modelId: String) -> [String: Any]? { - for directory in modelContentDirectories(for: modelId) { - let configPath = directory.appendingPathComponent("config.json") - guard let data = try? Data(contentsOf: configPath), - let config = try? JSONSerialization.jsonObject(with: data) as? [String: Any] - else { continue } - return config - } - return nil + guard let dir = cacheDirectory(for: modelId) else { return false } + // Must have a snapshots subdirectory with content + let snapshots = dir.appendingPathComponent("snapshots") + return FileManager.default.fileExists(atPath: snapshots.path) } // MARK: — Disk Operations @@ -265,20 +73,14 @@ public enum ModelStorage { /// Bytes used by a specific model on disk. public static func sizeOnDisk(for modelId: String) -> Int64 { - associatedDirectories(for: modelId).reduce(Int64(0)) { $0 + directorySize(at: $1) } + guard let dir = cacheDirectory(for: modelId) else { return 0 } + return directorySize(at: dir) } /// Delete all cached files for a model. public static func delete(_ modelId: String) throws { - var firstError: Error? - for dir in associatedDirectories(for: modelId) { - do { - try FileManager.default.removeItem(at: dir) - } catch { - if firstError == nil { firstError = error } - } - } - if let firstError { throw firstError } + guard let dir = cacheDirectory(for: modelId) else { return } + try FileManager.default.removeItem(at: dir) } // MARK: — iCloud Exclusion (iOS) @@ -312,200 +114,31 @@ public enum ModelStorage { ) else { return [] } - var resultsById: [String: ScannedModel] = [:] + var results: [ScannedModel] = [] for dir in contents { - if dir.lastPathComponent.hasPrefix("models--") { - let modelId = dir.lastPathComponent - .replacingOccurrences(of: "^models--", with: "", options: .regularExpression) - .replacingOccurrences(of: "--", with: "/") - addScannedModelIfDownloaded(modelId: modelId, dir: dir, resultsById: &resultsById) - } else if dir.lastPathComponent == "models" { - guard let organizations = try? FileManager.default.contentsOfDirectory( - at: dir, - includingPropertiesForKeys: [.contentModificationDateKey], - options: [.skipsHiddenFiles] - ) else { continue } - - for organization in organizations where directoryExists(organization) { - guard let modelDirs = try? FileManager.default.contentsOfDirectory( - at: organization, - includingPropertiesForKeys: [.contentModificationDateKey], - options: [.skipsHiddenFiles] - ) else { continue } - - for modelDir in modelDirs where directoryExists(modelDir) { - let modelId = "\(organization.lastPathComponent)/\(modelDir.lastPathComponent)" - addScannedModelIfDownloaded(modelId: modelId, dir: modelDir, resultsById: &resultsById) - } - } - } - } - return resultsById.values.sorted { ($0.modifiedDate ?? .distantPast) > ($1.modifiedDate ?? 
.distantPast) } - } + guard dir.lastPathComponent.hasPrefix("models--") else { continue } - private static func addScannedModelIfDownloaded( - modelId: String, - dir: URL, - resultsById: inout [String: ScannedModel] - ) { - guard isDownloaded(modelId) else { return } + // Reverse the naming convention to get the model ID + let modelId = dir.lastPathComponent + .replacingOccurrences(of: "^models--", with: "", options: .regularExpression) + .replacingOccurrences(of: "--", with: "/") - let modified = (try? dir.resourceValues(forKeys: [.contentModificationDateKey]))?.contentModificationDate - let candidate = ScannedModel( - modelId: modelId, - cacheDirectory: cacheDirectory(for: modelId) ?? dir, - sizeBytes: sizeOnDisk(for: modelId), - modifiedDate: modified - ) + // Do NOT filter by ModelCatalog anymore => allow arbitrary downloaded Hugging Face models! + guard isDownloaded(modelId) else { continue } // skip partial downloads - if let existing = resultsById[modelId], - (existing.modifiedDate ?? .distantPast) >= (candidate.modifiedDate ?? .distantPast) { - return + let modified = (try? dir.resourceValues(forKeys: [.contentModificationDateKey]))?.contentModificationDate + results.append(ScannedModel( + modelId: modelId, + cacheDirectory: dir, + sizeBytes: directorySize(at: dir), + modifiedDate: modified + )) } - resultsById[modelId] = candidate - } - - // MARK: — Incomplete Downloads - - /// A model whose download was interrupted and can be resumed. - public struct IncompleteDownload: Identifiable, Sendable { - public let id: String // modelId - public let cacheDirectory: URL - /// Bytes downloaded so far (sum of complete + incomplete files) - public let downloadedBytes: Int64 - /// When the partial download was last modified - public let lastModified: Date? - } - - /// Check whether a model directory has any `.incomplete` partial files (iOS path) - /// or incomplete blobs (macOS HubApi path). - public static func hasIncompleteFiles(for modelId: String) -> Bool { - associatedDirectories(for: modelId).contains { countIncompleteFiles(in: $0) > 0 } - } - - /// Scan the cache root for model directories that have partial downloads - /// but are NOT fully downloaded (i.e. `isDownloaded()` returns false, or - /// the directory contains `.incomplete` files). - public static func scanIncompleteDownloads() -> [IncompleteDownload] { - guard FileManager.default.fileExists(atPath: cacheRoot.path), - let contents = try? FileManager.default.contentsOfDirectory( - at: cacheRoot, - includingPropertiesForKeys: [.contentModificationDateKey], - options: [.skipsHiddenFiles] - ) - else { return [] } - - var resultsById: [String: IncompleteDownload] = [:] - for dir in contents { - if dir.lastPathComponent.hasPrefix("models--") { - let modelId = dir.lastPathComponent - .replacingOccurrences(of: "^models--", with: "", options: .regularExpression) - .replacingOccurrences(of: "--", with: "/") - addIncompleteDownloadIfNeeded(modelId: modelId, dir: dir, resultsById: &resultsById) - } else if dir.lastPathComponent == "models" { - guard let organizations = try? FileManager.default.contentsOfDirectory( - at: dir, - includingPropertiesForKeys: [.contentModificationDateKey], - options: [.skipsHiddenFiles] - ) else { continue } - - for organization in organizations where directoryExists(organization) { - guard let modelDirs = try? 
FileManager.default.contentsOfDirectory( - at: organization, - includingPropertiesForKeys: [.contentModificationDateKey], - options: [.skipsHiddenFiles] - ) else { continue } - - for modelDir in modelDirs where directoryExists(modelDir) { - let modelId = "\(organization.lastPathComponent)/\(modelDir.lastPathComponent)" - addIncompleteDownloadIfNeeded(modelId: modelId, dir: modelDir, resultsById: &resultsById) - } - } - } - } - return resultsById.values.sorted { ($0.lastModified ?? .distantPast) > ($1.lastModified ?? .distantPast) } - } - - private static func addIncompleteDownloadIfNeeded( - modelId: String, - dir: URL, - resultsById: inout [String: IncompleteDownload] - ) { - // Skip fully completed models unless they have leftover .incomplete files. - if isDownloaded(modelId) && !hasIncompleteFiles(for: modelId) { - return - } - - // Must have SOME content (not just an empty directory). - let size = directorySize(at: dir) - guard size > 0 else { return } - - let modified = (try? dir.resourceValues(forKeys: [.contentModificationDateKey]))?.contentModificationDate - let candidate = IncompleteDownload( - id: modelId, - cacheDirectory: dir, - downloadedBytes: size, - lastModified: modified - ) - - if let existing = resultsById[modelId], - (existing.lastModified ?? .distantPast) >= (candidate.lastModified ?? .distantPast) { - return - } - resultsById[modelId] = candidate - } - - /// Count `.incomplete` files in a directory tree. - private static func countIncompleteFiles(in directory: URL) -> Int { - guard let enumerator = FileManager.default.enumerator( - at: directory, - includingPropertiesForKeys: nil, - options: [.skipsHiddenFiles] - ) else { return 0 } - - var count = 0 - for case let fileURL as URL in enumerator { - if fileURL.pathExtension == "incomplete" { - count += 1 - } - } - return count + return results.sorted { ($0.modifiedDate ?? .distantPast) > ($1.modifiedDate ?? .distantPast) } } // MARK: — Helpers - private static func associatedDirectories(for modelId: String) -> [URL] { - let candidates = [ - materializedDirectoryURL(for: modelId), - hubCacheDirectoryURL(for: modelId), - ] - - var seen = Set() - return candidates.filter { url in - guard directoryExists(url), !seen.contains(url.path) else { return false } - seen.insert(url.path) - return true - } - } - - private static func fileSizeResolvingSymlink(_ url: URL) -> Int64? { - let resolved = url.resolvingSymlinksInPath() - guard let attrs = try? FileManager.default.attributesOfItem(atPath: resolved.path) else { return nil } - if let size = attrs[.size] as? Int64 { return size } - if let size = attrs[.size] as? NSNumber { return size.int64Value } - return nil - } - - private static func int64Value(_ value: Any?) -> Int64? { - switch value { - case let value as Int64: return value - case let value as Int: return Int64(value) - case let value as NSNumber: return value.int64Value - case let value as String: return Int64(value) - default: return nil - } - } - private static func ensureDirectory(_ url: URL) { guard !FileManager.default.fileExists(atPath: url.path) else { return } try? 
diff --git a/Sources/SwiftLM/ModelProfiler.swift b/Sources/SwiftLM/ModelProfiler.swift
index ea5f76a8..2cf40d85 100644
--- a/Sources/SwiftLM/ModelProfiler.swift
+++ b/Sources/SwiftLM/ModelProfiler.swift
@@ -12,6 +12,16 @@
 
 import Foundation
 import MLX
+import Darwin
+
+// C-layout mirror of xsw_usage from <sys/sysctl.h>, filled in by the vm.swapusage sysctl.
+struct xsw_usage {
+    var xsu_total: UInt64
+    var xsu_used: UInt64
+    var xsu_avail: UInt64
+    var xsu_pagesize: UInt32
+    var xsu_encrypted: Int32
+}
 
 // MARK: - Model Profile
 
@@ -80,8 +90,10 @@ struct SystemProfile: Sendable {
     let totalRAMBytes: UInt64
     let gpuArchitecture: String
     let recommendedWorkingSetBytes: Int
+    let swapUsedBytes: UInt64
 
     var totalRAMGB: Double { Double(totalRAMBytes) / 1e9 }
+    var swapUsedGB: Double { Double(swapUsedBytes) / 1e9 }
 
     /// RAM available for the model after reserving space for macOS (~4GB)
     var availableRAMGB: Double { max(0, totalRAMGB - 4.0) }
 }
@@ -204,6 +216,9 @@ enum ModelProfiler {
         let headDim: Int?
         let intermediateSize: Int?
         let vocabSize: Int?
+        let numExperts: Int?
+        let numExpertsAlt: Int?
+        let numExpertsPerTok: Int?
 
         enum CodingKeys: String, CodingKey {
             case numHiddenLayers = "num_hidden_layers"
@@ -213,6 +228,9 @@
             case headDim = "head_dim"
             case intermediateSize = "intermediate_size"
             case vocabSize = "vocab_size"
+            case numExperts = "num_local_experts"
+            case numExpertsAlt = "num_experts"
+            case numExpertsPerTok = "num_experts_per_tok"
         }
     }
 
@@ -252,9 +270,9 @@
         let quantBits = config.quantizationConfig?.bits ?? detectQuantBits(modelId: modelId)
 
         // Detect MoE
-        let isMoE = config.numExperts != nil && (config.numExperts ?? 0) > 1
-        let numExperts = config.numExperts
-        let numActiveExperts = config.numExpertsPerTok
+        let numExperts = config.numExperts ?? config.textConfig?.numExperts ?? config.textConfig?.numExpertsAlt
+        let isMoE = (numExperts ?? 0) > 1
+        let numActiveExperts = config.numExpertsPerTok ?? config.textConfig?.numExpertsPerTok
 
         // Measure weight file sizes on disk
         let weightSize = measureWeightFiles(directory: modelDirectory)
@@ -333,10 +351,17 @@
         let physicalBudget = Int(Double(totalRAM) * 0.85) - (4 * 1024 * 1024 * 1024)
         let recommended = max(physicalBudget, Int(deviceInfo.memorySize))
 
+        // Get swap usage
+        var swap = xsw_usage(xsu_total: 0, xsu_used: 0, xsu_avail: 0, xsu_pagesize: 0, xsu_encrypted: 0)
+        var size = MemoryLayout<xsw_usage>.size
+        let result = sysctlbyname("vm.swapusage", &swap, &size, nil, 0)
+        let swapUsed = (result == 0) ? swap.xsu_used : 0
+
         return SystemProfile(
             totalRAMBytes: totalRAM,
             gpuArchitecture: deviceInfo.architecture,
-            recommendedWorkingSetBytes: recommended
+            recommendedWorkingSetBytes: recommended,
+            swapUsedBytes: swapUsed
         )
     }
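
For reference, the `vm.swapusage` read added above can be exercised in isolation. A minimal sketch under this patch's assumptions (it reuses the `xsw_usage` mirror declared at the top of ModelProfiler.swift; `swapUsedBytes()` is a hypothetical helper, not part of this change):

    import Darwin

    // Returns bytes of swap currently in use, or 0 if the sysctl fails.
    func swapUsedBytes() -> UInt64 {
        var swap = xsw_usage(xsu_total: 0, xsu_used: 0, xsu_avail: 0,
                             xsu_pagesize: 0, xsu_encrypted: 0)
        var size = MemoryLayout<xsw_usage>.size
        guard sysctlbyname("vm.swapusage", &swap, &size, nil, 0) == 0 else { return 0 }
        return swap.xsu_used
    }
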
@@ -349,24 +374,29 @@
             : model.estimatedParamsB * (Double(model.quantBits) / 8.0)
         let draftGB = Double(draftWeightBytes) / 1e9
         let kvGB = model.kvCacheMemoryGB(contextLength: contextSize)
+
+        // MoE Hardening: Reserve a safety buffer for expert activation spikes.
+        // Even with SSD streaming, some buffers are resident.
+        let moeBufferGB = model.isMoE ? 2.0 : 0.0
+
         let overheadFactor = 1.2
-        let totalGB = (weightGB + draftGB) * overheadFactor + kvGB
+        let totalRequiredGB = (weightGB + draftGB) * overheadFactor + kvGB + moeBufferGB
         let availableGB = system.availableRAMGB
-        let overcommit = totalGB / availableGB
+        let overcommit = totalRequiredGB / availableGB
 
         var warnings: [String] = []
 
         // Determine strategy
         let strategy: PartitionStrategy
-        if totalGB <= availableGB * 0.85 {
+        if totalRequiredGB <= availableGB * 0.85 {
             strategy = .fullGPU
-        } else if totalGB <= availableGB {
+        } else if totalRequiredGB <= availableGB {
             strategy = .fullGPU
-            warnings.append("Model uses >\(Int(totalGB / availableGB * 100))% of available RAM. Performance may degrade under memory pressure.")
-        } else if totalGB <= availableGB * 2.0 {
+            warnings.append("Model uses >\(Int(totalRequiredGB / availableGB * 100))% of available RAM. Performance may degrade under memory pressure.")
+        } else if totalRequiredGB <= availableGB * 2.0 {
             strategy = .swapAssisted
             warnings.append("Model exceeds RAM by \(Int((overcommit - 1) * 100))%. macOS swap will be used. Expect 2-4× slowdown.")
-        } else if totalGB <= availableGB * 4.0 {
+        } else if totalRequiredGB <= availableGB * 4.0 {
             strategy = .layerPartitioned
             warnings.append("Model is \(String(format: "%.1f", overcommit))× system RAM. Layer partitioning needed for usable performance.")
             warnings.append("GPU/CPU layer split is not yet available in MLX Swift. Falling back to swap-assisted mode.")
@@ -431,7 +461,7 @@
             strategy: strategy,
             weightMemoryGB: weightGB,
             kvCacheMemoryGB: kvGB,
-            totalRequiredGB: totalGB,
+            totalRequiredGB: totalRequiredGB,
             systemRAMGB: system.totalRAMGB,
             availableRAMGB: availableGB,
             overcommitRatio: overcommit,
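
To make the effect of the new `moeBufferGB` term concrete, a worked example with illustrative numbers (not benchmark output): a hypothetical 26B-parameter MoE model quantized to 4 bits, no draft model, on a 64 GB machine.

    // All figures are decimal GB, matching totalRAMGB's 1e9 divisor.
    let weightGB        = 26.0 * (4.0 / 8.0)  // ~13.0 GB estimated weights
    let kvGB            = 2.1                 // example KV cache at the chosen context
    let moeBufferGB     = 2.0                 // the MoE headroom added by this patch
    let totalRequiredGB = weightGB * 1.2 + kvGB + moeBufferGB  // ~19.7 GB
    let availableGB     = 68.7 - 4.0          // 64 GiB reported as ~68.7 decimal GB
    // 19.7 <= 0.85 * 64.7 (~55.0), so this plan stays .fullGPU. The same 2 GB
    // buffer can push a borderline model into the warning band or .swapAssisted,
    // which is the intended safety margin for expert activation spikes.
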
diff --git a/Sources/SwiftLM/Server.swift b/Sources/SwiftLM/Server.swift
index d1298ac2..2827a24f 100644
--- a/Sources/SwiftLM/Server.swift
+++ b/Sources/SwiftLM/Server.swift
@@ -452,6 +452,9 @@ struct MLXServer: AsyncParsableCommand {
         let profile = mainModelProfile ?? ModelProfiler.profile(modelDirectory: modelDir, modelId: modelId)
         if let profile = profile {
             let system = ModelProfiler.systemProfile()
+            if system.swapUsedGB > 1.0 {
+                print("[SwiftLM] ⚠️ High swap usage detected: \(String(format: "%.1f", system.swapUsedGB))GB used. Performance may be degraded.")
+            }
             let contextSize = self.ctxSize ?? 4096
             let plan = ModelProfiler.plan(model: profile, system: system, contextSize: contextSize, draftWeightBytes: draftFootprintBytes)
             partitionPlan = plan
@@ -471,7 +474,7 @@ struct MLXServer: AsyncParsableCommand {
                 // SSD Streaming: expert weights are mmap'd from SSD via the OS page cache.
                 // No swap involved — the page cache evicts stale expert pages cleanly.
                 // draftFootprintBytes pre-computed once above (Copilot review).
-                let physicalBudget = computeSSDMemoryBudget(totalRAMBytes: system.totalRAMBytes, draftWeightBytes: draftFootprintBytes)
+                let physicalBudget = computeSSDMemoryBudget(totalRAMBytes: system.totalRAMBytes, isMoE: profile.isMoE, draftWeightBytes: draftFootprintBytes)
                 Memory.cacheLimit = physicalBudget
                 print("[SwiftLM] 💾 Memory strategy: SSD STREAMING (page-cache managed, \(physicalBudget / (1024*1024*1024))GB RAM budget, no swap)")
             } else {
@@ -482,7 +485,7 @@ struct MLXServer: AsyncParsableCommand {
         case .layerPartitioned:
             if self.streamExperts {
                 // draftFootprintBytes pre-computed once above (Copilot review).
-                let physicalBudget = computeSSDMemoryBudget(totalRAMBytes: system.totalRAMBytes, draftWeightBytes: draftFootprintBytes)
+                let physicalBudget = computeSSDMemoryBudget(totalRAMBytes: system.totalRAMBytes, isMoE: profile.isMoE, draftWeightBytes: draftFootprintBytes)
                 Memory.cacheLimit = physicalBudget
                 print("[SwiftLM] 💾 Memory strategy: SSD STREAMING (page-cache managed, \(physicalBudget / (1024*1024*1024))GB RAM budget, no swap)")
             } else {
@@ -726,7 +729,7 @@ struct MLXServer: AsyncParsableCommand {
                 print("[SwiftLM] 🚀 PAPPS 16-Worker Thread Pool prefetcher enabled!")
             }
         } else {
-            print("[SwiftLM] ⚠️ Model does not support SSD expert streaming")
+            print("[SwiftLM] ⚠️ Model does not support SSD expert streaming (check the model architecture for QuantizedSwitchLinear layers)")
         }
     }
@@ -1058,9 +1061,10 @@ struct ServerConfig: Sendable {
 /// Subtracted so the draft model's resident pages don't push the main model's
 /// page cache over the physical limit and trigger swap (Issue #72).
 /// - Returns: The recommended `Memory.cacheLimit` value in bytes.
-func computeSSDMemoryBudget(totalRAMBytes: UInt64, draftWeightBytes: Int = 0) -> Int {
+func computeSSDMemoryBudget(totalRAMBytes: UInt64, isMoE: Bool = false, draftWeightBytes: Int = 0) -> Int {
     let osHeadroom = 4 * 1024 * 1024 * 1024 // 4 GB for OS + system processes
-    let raw = Int(Double(totalRAMBytes) * 0.85) - osHeadroom - draftWeightBytes
+    let moeBuffer = isMoE ? 2 * 1024 * 1024 * 1024 : 0 // 2 GB safety buffer for dynamic expert activation
+    let raw = Int(Double(totalRAMBytes) * 0.85) - osHeadroom - draftWeightBytes - moeBuffer
     return max(raw, 2 * 1024 * 1024 * 1024) // floor at 2 GB
 }
@@ -1180,24 +1184,7 @@ actor PromptCache {
         if cache.contains(where: { $0 is MambaCache }) { return }
 
-        let P = tokens.count
-        // For attention KVCacheSimple layers, the state tensor is [B, H, T, D] with a
-        // pre-allocated T that can exceed the actual prompt length P. If we store the
-        // full over-sized buffer, restore()'s trim() by (cached.tokens.count - matchLen)
-        // still leaves T - P slots of garbage beyond the valid prefix. Slice T to P at
-        // save time so cached.tokens.count === cached state's T.
-        let states: [[MLXArray]] = cache.map { layer -> [MLXArray] in
-            let s = layer.state
-            if layer is KVCacheSimple {
-                return s.map { arr -> MLXArray in
-                    guard arr.ndim >= 3 else { return arr }
-                    let T = arr.dim(2)
-                    if T > P { return arr[.ellipsis, ..