From 6c722f8fa71248ea753311620f257aa7657fff24 Mon Sep 17 00:00:00 2001
From: Jay Rodge <jayrodge@live.com>
Date: Tue, 9 Jun 2026 15:33:37 -0400
Subject: [PATCH] Add SenseVoice CoreML backend

---
 .../ASR/SenseVoice/SenseVoiceConfig.swift     |  42 +++++
 .../ASR/SenseVoice/SenseVoiceManager.swift    | 152 +++++++++++++++++
 .../ASR/SenseVoice/SenseVoiceModels.swift     | 155 ++++++++++++++++++
 Sources/FluidAudio/ModelNames.swift           |  27 +++
 4 files changed, 376 insertions(+)
 create mode 100644 Sources/FluidAudio/ASR/SenseVoice/SenseVoiceConfig.swift
 create mode 100644 Sources/FluidAudio/ASR/SenseVoice/SenseVoiceManager.swift
 create mode 100644 Sources/FluidAudio/ASR/SenseVoice/SenseVoiceModels.swift
diff --git a/Sources/FluidAudio/ASR/SenseVoice/SenseVoiceConfig.swift b/Sources/FluidAudio/ASR/SenseVoice/SenseVoiceConfig.swift
new file mode 100644
index 000000000..9c100d52f
--- /dev/null
+++ b/Sources/FluidAudio/ASR/SenseVoice/SenseVoiceConfig.swift
@@ -0,0 +1,42 @@
+import Foundation
+
+/// Constants shared by the SenseVoiceSmall CoreML pipeline.
+///
+/// The model is non-autoregressive (SANM encoder + a single CTC head) and is exported
+/// as three stages: preprocessor, encoder+CTC, and a host-side greedy decode.
+public enum SenseVoiceConfig {
+    /// Width of one LFR feature frame (80-bin fbank * LFR m=7).
+    public static let featureDim = 560
+
+    /// Encoder sequence lengths (in LFR frames) the export enumerates, ascending.
+    public static let buckets = [128, 256, 512, 1024, 1800]
+
+    /// Query tokens the encoder prepends: language, emotion, event, text norm.
+    public static let numQueryTokens = 4
+
+    /// CTC blank id; SenseVoice reuses `<unk>` = 0 for it.
+    public static let blankId = 0
+
+    /// Language id that requests auto-detection.
+    public static let defaultLanguage: Int32 = 0
+
+    /// Index of the English language embedding in FunASR/SenseVoice's `lid_dict`.
+    public static let englishLanguage: Int32 = 4
+
+    /// Text-norm id for `woitn` (no inverse text normalization).
+    public static let defaultTextNorm: Int32 = 15
+
+    /// Sample rate, in Hz, of the audio the pipeline consumes.
+    public static let sampleRate = 16_000
+
+    /// Rescales AudioConverter's [-1, 1] samples to the int16 range Kaldi expects.
+    public static let waveformScale: Float = 32_768.0
+
+    /// Largest supported feature length, about 108 seconds of audio.
+    public static var maxFrames: Int { buckets.last ?? 1800 }
+
+    /// Returns the first bucket that holds `frames`, or the largest bucket when none does.
+    public static func pickBucket(forFrames frames: Int) -> Int {
+        buckets.first { $0 >= frames } ?? maxFrames
+    }
+}
diff --git a/Sources/FluidAudio/ASR/SenseVoice/SenseVoiceManager.swift b/Sources/FluidAudio/ASR/SenseVoice/SenseVoiceManager.swift
new file mode 100644
index 000000000..2eaafaa33
--- /dev/null
+++ b/Sources/FluidAudio/ASR/SenseVoice/SenseVoiceManager.swift
@@ -0,0 +1,152 @@
+@preconcurrency import CoreML
+import Foundation
+
+/// Manager for SenseVoiceSmall transcription.
+///
+/// Pipeline: waveform -> preprocessor -> LFR features -> encoder+CTC logits
+/// -> host greedy CTC decode.
+public actor SenseVoiceManager {
+    private let models: SenseVoiceModels
+    private let language: Int32
+    private let textNorm: Int32
+    private static let logger = AppLogger(category: "SenseVoiceManager")
+
+    /// Creates a manager over loaded models; `language` and `textNorm` are the ids passed to the encoder.
+    public init(
+        models: SenseVoiceModels,
+        language: Int32 = SenseVoiceConfig.defaultLanguage,
+        textNorm: Int32 = SenseVoiceConfig.defaultTextNorm
+    ) {
+        self.models = models
+        self.language = language
+        self.textNorm = textNorm
+    }
+
+    /// Obtains the models through `SenseVoiceModels.downloadAndLoad` and wraps them in a manager.
+    public static func load(
+        precision: SenseVoiceEncoderPrecision = .fp16,
+        language: Int32 = SenseVoiceConfig.defaultLanguage,
+        textNorm: Int32 = SenseVoiceConfig.defaultTextNorm,
+        progressHandler: DownloadUtils.ProgressHandler? = nil
+    ) async throws -> SenseVoiceManager {
+        let models = try await SenseVoiceModels.downloadAndLoad(precision: precision, progressHandler: progressHandler)
+        return SenseVoiceManager(models: models, language: language, textNorm: textNorm)
+    }
+
+    /// Transcribe an audio file; it goes through `AudioConverter.resampleAudioFile` (16 kHz target) first.
+    public func transcribe(audioURL: URL) throws -> String {
+        let converter = AudioConverter(sampleRate: Double(SenseVoiceConfig.sampleRate))
+        let samples = try converter.resampleAudioFile(audioURL)
+        return try transcribe(audio: samples)
+    }
+
+    /// Transcribe 16 kHz mono float samples in [-1, 1].
+    public func transcribe(audio: [Float]) throws -> String {
+        let features = try runPreprocessor(audio: audio)
+        let (logits, validFrames) = try runEncoder(features: features)
+        return decode(logits: logits, validFrames: validFrames)
+    }
+
+    /// Scales the samples to the int16 range and returns the preprocessor model's `features` output.
+    private func runPreprocessor(audio: [Float]) throws -> MLMultiArray {
+        let waveform = try MLMultiArray(shape: [1, audio.count as NSNumber], dataType: .float32)
+        let scale = SenseVoiceConfig.waveformScale
+        let pointer = waveform.dataPointer.assumingMemoryBound(to: Float32.self)
+        for index in 0..<audio.count {
+            pointer[index] = audio[index] * scale
+        }
+
+        let input = try MLDictionaryFeatureProvider(dictionary: [
+            "waveform": MLFeatureValue(multiArray: waveform)
+        ])
+        let output = try models.preprocessor.prediction(from: input)
+        guard let features = output.featureValue(for: "features")?.multiArrayValue else {
+            throw ASRError.processingFailed("SenseVoice preprocessor produced no features.")
+        }
+        return features
+    }
+
+    /// Pads features into the smallest fitting bucket, runs encoder+CTC, returns (logits, valid frame count).
+    private func runEncoder(features: MLMultiArray) throws -> (MLMultiArray, Int) {
+        let dimension = SenseVoiceConfig.featureDim
+        // The copy below reads the buffer as [1, frames, dimension]; reject any other layout.
+        guard features.shape.count == 3, features.shape[2].intValue == dimension else {
+            throw ASRError.processingFailed("SenseVoice preprocessor produced unexpected shape \(features.shape).")
+        }
+        var frameCount = features.shape[1].intValue
+        if frameCount > SenseVoiceConfig.maxFrames {
+            Self.logger.warning("Audio exceeds max length; truncating \(frameCount) to \(SenseVoiceConfig.maxFrames) frames")
+            frameCount = SenseVoiceConfig.maxFrames
+        }
+        let bucket = SenseVoiceConfig.pickBucket(forFrames: frameCount)
+
+        let speech = try MLMultiArray(shape: [1, bucket as NSNumber, dimension as NSNumber], dataType: .float32)
+        let speechPointer = speech.dataPointer.assumingMemoryBound(to: Float32.self)
+        memset(speechPointer, 0, bucket * dimension * MemoryLayout<Float32>.size)
+
+        let copiedCount = frameCount * dimension
+        if features.dataType == .float32 {
+            memcpy(speechPointer, features.dataPointer, copiedCount * MemoryLayout<Float32>.size)
+        } else {
+            for index in 0..<copiedCount { speechPointer[index] = features[index].floatValue }
+        }
+
+        let lengths = try MLMultiArray(shape: [1], dataType: .int32)
+        lengths[0] = NSNumber(value: frameCount)
+        let languageArray = try MLMultiArray(shape: [1], dataType: .int32)
+        languageArray[0] = NSNumber(value: language)
+        let textNormArray = try MLMultiArray(shape: [1], dataType: .int32)
+        textNormArray[0] = NSNumber(value: textNorm)
+
+        let input = try MLDictionaryFeatureProvider(dictionary: [
+            "speech": MLFeatureValue(multiArray: speech),
+            "speech_lengths": MLFeatureValue(multiArray: lengths),
+            "language": MLFeatureValue(multiArray: languageArray),
+            "textnorm": MLFeatureValue(multiArray: textNormArray),
+        ])
+        let output = try models.encoder.prediction(from: input)
+        guard let logits = output.featureValue(for: "ctc_logits")?.multiArrayValue else {
+            throw ASRError.processingFailed("SenseVoice encoder produced no ctc_logits.")
+        }
+        return (logits, SenseVoiceConfig.numQueryTokens + frameCount)
+    }
+
+    /// Greedy CTC decode: per-frame argmax, collapse repeats, drop blanks, strip `<|...|>` tags.
+    private func decode(logits: MLMultiArray, validFrames: Int) -> String {
+        let vocabularySize = logits.shape[2].intValue
+        // An empty vocabulary axis has no argmax; treat it as zero decodable frames.
+        let frames = vocabularySize > 0 ? min(validFrames, logits.shape[1].intValue) : 0
+        var ids: [Int] = []
+        var previous = -1
+
+        func appendArgmax(valueAt: (Int) -> Float) {
+            var best = 0
+            var bestValue = valueAt(0)
+            for token in 1..<vocabularySize {
+                let value = valueAt(token)
+                if value > bestValue { (best, bestValue) = (token, value) }
+            }
+            if best != SenseVoiceConfig.blankId && best != previous { ids.append(best) }
+            previous = best
+        }
+
+        if logits.dataType == .float32 {
+            // Core ML may pad rows, so address elements through the reported strides.
+            let pointer = logits.dataPointer.assumingMemoryBound(to: Float32.self)
+            let frameStride = logits.strides[1].intValue
+            let tokenStride = logits.strides[2].intValue
+            for frame in 0..<frames {
+                let base = frame * frameStride
+                appendArgmax { pointer[base + $0 * tokenStride] }
+            }
+        } else {
+            for frame in 0..<frames {
+                appendArgmax { logits[[0, frame as NSNumber, $0 as NSNumber]].floatValue }
+            }
+        }
+
+        return decodeCtcTokenIds(ids, vocabulary: models.vocabulary)
+            .replacingOccurrences(of: "<\\|[^|]*\\|>", with: "", options: .regularExpression)
+            .trimmingCharacters(in: .whitespacesAndNewlines)
+    }
+}
diff --git a/Sources/FluidAudio/ASR/SenseVoice/SenseVoiceModels.swift b/Sources/FluidAudio/ASR/SenseVoice/SenseVoiceModels.swift
new file mode 100644
index 000000000..537247793
--- /dev/null
+++ b/Sources/FluidAudio/ASR/SenseVoice/SenseVoiceModels.swift
@@ -0,0 +1,155 @@
+@preconcurrency import CoreML
+import Foundation
+
+/// SenseVoice encoder weight precision; fp16/int8 run on the Neural Engine, fp32 is the non-ANE fallback.
+public enum SenseVoiceEncoderPrecision: String, Sendable {
+    case fp16, int8, fp32
+
+    /// Base name (without extension) of the encoder model for this precision.
+    var modelName: String {
+        switch self {
+        case .fp16: return ModelNames.SenseVoice.encoder
+        case .int8: return ModelNames.SenseVoice.encoderInt8
+        case .fp32: return ModelNames.SenseVoice.encoderFp32
+        }
+    }
+
+    /// Compute units requested for the encoder: `.all` for fp32, CPU + Neural Engine otherwise.
+    var computeUnits: MLComputeUnits {
+        if case .fp32 = self { return .all }
+        return .cpuAndNeuralEngine
+    }
+}
+
+/// Loaded SenseVoiceSmall CoreML models plus vocabulary.
+public struct SenseVoiceModels: Sendable {
+    public let preprocessor: MLModel
+    public let encoder: MLModel
+    public let vocabulary: [Int: String]
+
+    private static let logger = AppLogger(category: "SenseVoiceModels")
+
+    /// Wraps already-loaded models and a token-id-to-text vocabulary.
+    public init(preprocessor: MLModel, encoder: MLModel, vocabulary: [Int: String]) {
+        self.preprocessor = preprocessor
+        self.encoder = encoder
+        self.vocabulary = vocabulary
+    }
+
+    /// Ensures the models are on disk via `download`, then loads them with `load`.
+    public static func downloadAndLoad(
+        precision: SenseVoiceEncoderPrecision = .fp16,
+        progressHandler: DownloadUtils.ProgressHandler? = nil
+    ) async throws -> SenseVoiceModels {
+        let directory = try await download(precision: precision, progressHandler: progressHandler)
+        return try load(from: directory, precision: precision)
+    }
+
+    /// Returns the model directory, downloading the repo unless the needed files exist; `force` re-downloads.
+    public static func download(
+        precision: SenseVoiceEncoderPrecision = .fp16,
+        force: Bool = false,
+        progressHandler: DownloadUtils.ProgressHandler? = nil
+    ) async throws -> URL {
+        let modelsRoot = modelsRootDirectory()
+        let targetDir = modelsRoot.appendingPathComponent(Repo.senseVoiceSmall.folderName, isDirectory: true)
+
+        if !force && modelsExist(at: targetDir, precision: precision) {
+            logger.info("SenseVoice models already present at: \(targetDir.path)")
+            return targetDir
+        }
+        if force { try? FileManager.default.removeItem(at: targetDir) }
+
+        logger.info("Downloading SenseVoice models from Hugging Face")
+        try await DownloadUtils.downloadRepo(.senseVoiceSmall, to: modelsRoot, progressHandler: progressHandler)
+        logger.info("Successfully downloaded SenseVoice models")
+        return targetDir
+    }
+
+    /// True when the `.mlmodelc` preprocessor, the `.mlmodelc` encoder for `precision`, and the vocabulary exist.
+    public static func modelsExist(at directory: URL, precision: SenseVoiceEncoderPrecision = .fp16) -> Bool {
+        let required = [
+            ModelNames.SenseVoice.preprocessorFile,
+            precision.modelName + ".mlmodelc",
+            ModelNames.SenseVoice.vocabularyFile,
+        ]
+        return required.allSatisfy {
+            FileManager.default.fileExists(atPath: directory.appendingPathComponent($0).path)
+        }
+    }
+
+    /// Loads the preprocessor (CPU only), the encoder for `precision`, and the vocabulary from `directory`.
+    public static func load(
+        from directory: URL,
+        precision: SenseVoiceEncoderPrecision = .fp16
+    ) throws -> SenseVoiceModels {
+        let cpuConfig = MLModelConfiguration()
+        cpuConfig.computeUnits = .cpuOnly
+
+        let encoderConfig = MLModelConfiguration()
+        encoderConfig.computeUnits = precision.computeUnits
+
+        let preprocessor = try loadModel(
+            named: ModelNames.SenseVoice.preprocessor,
+            from: directory,
+            configuration: cpuConfig
+        )
+        let encoder = try loadModel(
+            named: precision.modelName,
+            from: directory,
+            configuration: encoderConfig
+        )
+        let vocabulary = try loadVocabulary(from: directory)
+
+        logger.info("Loaded SenseVoice (encoder: \(precision.rawValue), vocab: \(vocabulary.count))")
+        return SenseVoiceModels(preprocessor: preprocessor, encoder: encoder, vocabulary: vocabulary)
+    }
+
+    /// Opens `<name>.mlmodelc` from `directory`, compiling `<name>.mlpackage` when no compiled model exists.
+    private static func loadModel(
+        named name: String,
+        from directory: URL,
+        configuration: MLModelConfiguration
+    ) throws -> MLModel {
+        let compiledPath = directory.appendingPathComponent("\(name).mlmodelc")
+        let packagePath = directory.appendingPathComponent("\(name).mlpackage")
+        let modelURL: URL
+
+        if FileManager.default.fileExists(atPath: compiledPath.path) {
+            modelURL = compiledPath
+        } else if FileManager.default.fileExists(atPath: packagePath.path) {
+            modelURL = try MLModel.compileModel(at: packagePath)
+        } else {
+            throw ASRError.processingFailed("SenseVoice model not found: \(name)")
+        }
+
+        return try MLModel(contentsOf: modelURL, configuration: configuration)
+    }
+
+    /// Reads `vocab.json` as either a JSON string array (position = id) or an object keyed by id strings.
+    private static func loadVocabulary(from directory: URL) throws -> [Int: String] {
+        let path = directory.appendingPathComponent(ModelNames.SenseVoice.vocabularyFile)
+        let data = try Data(contentsOf: path)
+        // Parse once; malformed JSON falls through to the error at the end.
+        let json = try? JSONSerialization.jsonObject(with: data)
+
+        if let array = json as? [String] {
+            return Dictionary(uniqueKeysWithValues: array.enumerated().map { index, token in (index, token) })
+        }
+        if let dict = json as? [String: String] {
+            // "7" and "07" parse to the same id; keep one entry instead of trapping on the duplicate.
+            let pairs = dict.compactMap { key, token in Int(key).map { ($0, token) } }
+            return Dictionary(pairs, uniquingKeysWith: { kept, _ in kept })
+        }
+
+        throw ASRError.processingFailed("Failed to parse SenseVoice vocab.json.")
+    }
+
+    /// `FluidAudio/Models` under Application Support, or under the temporary directory as a fallback.
+    private static func modelsRootDirectory() -> URL {
+        let appSupport = FileManager.default.urls(for: .applicationSupportDirectory, in: .userDomainMask).first
+        return (appSupport ?? FileManager.default.temporaryDirectory)
+            .appendingPathComponent("FluidAudio", isDirectory: true)
+            .appendingPathComponent("Models", isDirectory: true)
+    }
+}
diff --git a/Sources/FluidAudio/ModelNames.swift b/Sources/FluidAudio/ModelNames.swift
index 1d4d7e9fc..e44f23513 100644
--- a/Sources/FluidAudio/ModelNames.swift
+++ b/Sources/FluidAudio/ModelNames.swift
@@ -7,6 +7,7 @@ public enum Repo: String, CaseIterable {
     case parakeetV2 = "FluidInference/parakeet-tdt-0.6b-v2-coreml"
     case parakeetCtc110m = "FluidInference/parakeet-ctc-110m-coreml"
     case parakeetCtc06b = "FluidInference/parakeet-ctc-0.6b-coreml"
+    case senseVoiceSmall = "FluidInference/sensevoice-small-coreml"
     case parakeetEou160 = "FluidInference/parakeet-realtime-eou-120m-coreml/160ms"
     case parakeetEou320 = "FluidInference/parakeet-realtime-eou-120m-coreml/320ms"
     case parakeetEou1280 = "FluidInference/parakeet-realtime-eou-120m-coreml/1280ms"
@@ -35,6 +36,8 @@ public enum Repo: String, CaseIterable {
             return "parakeet-ctc-110m-coreml"
         case .parakeetCtc06b:
             return "parakeet-ctc-0.6b-coreml"
+        case .senseVoiceSmall:
+            return "sensevoice-small-coreml"
         case .parakeetEou160:
             return "parakeet-realtime-eou-120m-coreml/160ms"
         case .parakeetEou320:
@@ -275,6 +278,28 @@ public enum ModelNames {
         ]
     }
 
+    /// Base names and on-disk file names of the SenseVoiceSmall CoreML models.
+    public enum SenseVoice {
+        public static let preprocessor = "SenseVoicePreprocessor"
+        public static let encoder = "SenseVoiceSmall"
+        public static let encoderInt8 = "SenseVoiceSmall_int8"
+        public static let encoderFp32 = "SenseVoiceSmall_fp32"
+        // File names: each base name above with the compiled-model `.mlmodelc` extension.
+        public static let preprocessorFile = preprocessor + ".mlmodelc"
+        public static let encoderFile = encoder + ".mlmodelc"
+        public static let encoderInt8File = encoderInt8 + ".mlmodelc"
+        public static let encoderFp32File = encoderFp32 + ".mlmodelc"
+        public static let vocabularyFile = "vocab.json"
+        // Required set reported for `Repo.senseVoiceSmall`; it lists all three encoder precisions.
+        public static let requiredModels: Set<String> = [
+            preprocessorFile,
+            encoderFile,
+            encoderInt8File,
+            encoderFp32File,
+            vocabularyFile,
+        ]
+    }
+
     /// Nemotron Speech Streaming 0.6B model names
     /// NVIDIA's streaming FastConformer RNNT with encoder cache
     public enum NemotronStreaming {
@@ -584,6 +609,8 @@ public enum ModelNames {
             return ModelNames.ASR.requiredModelsFused
         case .parakeetCtc110m, .parakeetCtc06b:
             return ModelNames.CTC.requiredModels
+        case .senseVoiceSmall:
+            return ModelNames.SenseVoice.requiredModels
         case .parakeetEou160, .parakeetEou320, .parakeetEou1280:
             return ModelNames.ParakeetEOU.requiredModels
         case .nemotronStreaming1120, .nemotronStreaming560: