From 6c722f8fa71248ea753311620f257aa7657fff24 Mon Sep 17 00:00:00 2001 From: Jay Rodge Date: Tue, 9 Jun 2026 15:33:37 -0400 Subject: [PATCH] Add SenseVoice CoreML backend --- .../ASR/SenseVoice/SenseVoiceConfig.swift | 42 +++++ .../ASR/SenseVoice/SenseVoiceManager.swift | 152 +++++++++++++++++ .../ASR/SenseVoice/SenseVoiceModels.swift | 155 ++++++++++++++++++ Sources/FluidAudio/ModelNames.swift | 27 +++ 4 files changed, 376 insertions(+) create mode 100644 Sources/FluidAudio/ASR/SenseVoice/SenseVoiceConfig.swift create mode 100644 Sources/FluidAudio/ASR/SenseVoice/SenseVoiceManager.swift create mode 100644 Sources/FluidAudio/ASR/SenseVoice/SenseVoiceModels.swift diff --git a/Sources/FluidAudio/ASR/SenseVoice/SenseVoiceConfig.swift b/Sources/FluidAudio/ASR/SenseVoice/SenseVoiceConfig.swift new file mode 100644 index 000000000..9c100d52f --- /dev/null +++ b/Sources/FluidAudio/ASR/SenseVoice/SenseVoiceConfig.swift @@ -0,0 +1,42 @@ +import Foundation + +/// Configuration constants for the SenseVoiceSmall CoreML pipeline. +/// +/// SenseVoiceSmall is non-autoregressive: a SANM encoder + single CTC head. +/// The CoreML export is a 3-stage pipeline: preprocessor, encoder+CTC, then +/// host-side greedy decode. +public enum SenseVoiceConfig { + /// LFR feature dimension (80-bin fbank * LFR m=7). + public static let featureDim = 560 + + /// Enumerated encoder sequence-length buckets after LFR framing. + public static let buckets = [128, 256, 512, 1024, 1800] + + /// Query tokens prepended by the encoder: language, emotion, event, text norm. + public static let numQueryTokens = 4 + + /// SenseVoice uses token id 0 as the CTC blank. + public static let blankId = 0 + + /// Default language index; 0 means auto-detect language. + public static let defaultLanguage: Int32 = 0 + + /// English language embedding index in FunASR/SenseVoice's `lid_dict`. + public static let englishLanguage: Int32 = 4 + + /// `woitn`: no inverse text normalization. 
+ public static let defaultTextNorm: Int32 = 15 + + public static let sampleRate = 16_000 + + /// Kaldi feeds waveforms in int16 range; AudioConverter yields [-1, 1]. + public static let waveformScale: Float = 32_768.0 + + /// Largest supported feature length (the last entry of `buckets`), about 108 seconds of audio. + public static var maxFrames: Int { buckets.last ?? 1800 } + + public static func pickBucket(forFrames frames: Int) -> Int { + for bucket in buckets where bucket >= frames { return bucket } + return buckets.last ?? 1800 + } +} diff --git a/Sources/FluidAudio/ASR/SenseVoice/SenseVoiceManager.swift b/Sources/FluidAudio/ASR/SenseVoice/SenseVoiceManager.swift new file mode 100644 index 000000000..2eaafaa33 --- /dev/null +++ b/Sources/FluidAudio/ASR/SenseVoice/SenseVoiceManager.swift @@ -0,0 +1,152 @@ +@preconcurrency import CoreML +import Foundation + +/// Manager for SenseVoiceSmall transcription. +/// +/// Pipeline: waveform -> preprocessor -> LFR features -> encoder+CTC logits +/// -> host greedy CTC decode. +public actor SenseVoiceManager { + private let models: SenseVoiceModels + private let language: Int32 + private let textNorm: Int32 + private static let logger = AppLogger(category: "SenseVoiceManager") + + public init( + models: SenseVoiceModels, + language: Int32 = SenseVoiceConfig.defaultLanguage, + textNorm: Int32 = SenseVoiceConfig.defaultTextNorm + ) { + self.models = models + self.language = language + self.textNorm = textNorm + } + + public static func load( + precision: SenseVoiceEncoderPrecision = .fp16, + language: Int32 = SenseVoiceConfig.defaultLanguage, + textNorm: Int32 = SenseVoiceConfig.defaultTextNorm, + progressHandler: DownloadUtils.ProgressHandler? = nil + ) async throws -> SenseVoiceManager { + let models = try await SenseVoiceModels.downloadAndLoad( + precision: precision, + progressHandler: progressHandler + ) + return SenseVoiceManager(models: models, language: language, textNorm: textNorm) + } + + /// Transcribe an audio file, reading it through `AudioConverter.resampleAudioFile` at `SenseVoiceConfig.sampleRate`. 
+ public func transcribe(audioURL: URL) throws -> String { + let converter = AudioConverter(sampleRate: Double(SenseVoiceConfig.sampleRate)) + let samples = try converter.resampleAudioFile(audioURL) + return try transcribe(audio: samples) + } + + /// Transcribe 16 kHz mono float samples in [-1, 1]: preprocessor, then encoder, then greedy CTC decode. + public func transcribe(audio: [Float]) throws -> String { + let features = try runPreprocessor(audio: audio) + let (logits, validFrames) = try runEncoder(features: features) + return decode(logits: logits, validFrames: validFrames) + } + + private func runPreprocessor(audio: [Float]) throws -> MLMultiArray { + let waveform = try MLMultiArray(shape: [1, audio.count as NSNumber], dataType: .float32) + let scale = SenseVoiceConfig.waveformScale + let pointer = waveform.dataPointer.assumingMemoryBound(to: Float32.self) + for index in 0.. (MLMultiArray, Int) { + let dimension = SenseVoiceConfig.featureDim + var frameCount = features.shape[1].intValue + if frameCount > SenseVoiceConfig.maxFrames { + Self.logger.warning("Audio exceeds max length; truncating \(frameCount) to \(SenseVoiceConfig.maxFrames) frames") + frameCount = SenseVoiceConfig.maxFrames + } + let bucket = SenseVoiceConfig.pickBucket(forFrames: frameCount) + + let speech = try MLMultiArray( + shape: [1, bucket as NSNumber, dimension as NSNumber], + dataType: .float32 + ) + let speechPointer = speech.dataPointer.assumingMemoryBound(to: Float32.self) + memset(speechPointer, 0, bucket * dimension * MemoryLayout.size) + + let copiedCount = frameCount * dimension + if features.dataType == .float32 { + memcpy(speechPointer, features.dataPointer, copiedCount * MemoryLayout.size) + } else { + for index in 0.. String { + let vocabularySize = logits.shape[2].intValue + let frames = min(validFrames, logits.shape[1].intValue) + var ids: [Int] = [] + var previous = -1 + + func appendArgmax(valueAt: (Int) -> Float) { + var best = 0 + var bestValue = valueAt(0) + for token in 1.. 
bestValue { + bestValue = value + best = token + } + } + if best != SenseVoiceConfig.blankId && best != previous { + ids.append(best) + } + previous = best + } + + if logits.dataType == .float32 { + let pointer = logits.dataPointer.assumingMemoryBound(to: Float32.self) + for frame in 0..", with: "", options: .regularExpression) + .trimmingCharacters(in: .whitespacesAndNewlines) + } +} diff --git a/Sources/FluidAudio/ASR/SenseVoice/SenseVoiceModels.swift b/Sources/FluidAudio/ASR/SenseVoice/SenseVoiceModels.swift new file mode 100644 index 000000000..537247793 --- /dev/null +++ b/Sources/FluidAudio/ASR/SenseVoice/SenseVoiceModels.swift @@ -0,0 +1,155 @@ +@preconcurrency import CoreML +import Foundation + +/// SenseVoice encoder weight precision. fp16 and int8 load with `.cpuAndNeuralEngine`; +/// fp32 loads with `.all` compute units. +public enum SenseVoiceEncoderPrecision: String, Sendable { + case fp16 + case int8 + case fp32 + + var modelName: String { + switch self { + case .fp16: return ModelNames.SenseVoice.encoder + case .int8: return ModelNames.SenseVoice.encoderInt8 + case .fp32: return ModelNames.SenseVoice.encoderFp32 + } + } + + var computeUnits: MLComputeUnits { + self == .fp32 ? .all : .cpuAndNeuralEngine + } +} + +/// Loaded SenseVoiceSmall CoreML models (preprocessor and encoder) plus the id-to-token vocabulary. +public struct SenseVoiceModels: Sendable { + public let preprocessor: MLModel + public let encoder: MLModel + public let vocabulary: [Int: String] + + private static let logger = AppLogger(category: "SenseVoiceModels") + + public init(preprocessor: MLModel, encoder: MLModel, vocabulary: [Int: String]) { + self.preprocessor = preprocessor + self.encoder = encoder + self.vocabulary = vocabulary + } + + public static func downloadAndLoad( + precision: SenseVoiceEncoderPrecision = .fp16, + progressHandler: DownloadUtils.ProgressHandler? 
= nil + ) async throws -> SenseVoiceModels { + let directory = try await download(precision: precision, progressHandler: progressHandler) + return try load(from: directory, precision: precision) + } + + public static func download( + precision: SenseVoiceEncoderPrecision = .fp16, + force: Bool = false, + progressHandler: DownloadUtils.ProgressHandler? = nil + ) async throws -> URL { + let modelsRoot = modelsRootDirectory() + let targetDir = modelsRoot.appendingPathComponent(Repo.senseVoiceSmall.folderName, isDirectory: true) + + if !force && modelsExist(at: targetDir, precision: precision) { + logger.info("SenseVoice models already present at: \(targetDir.path)") + return targetDir + } + if force { try? FileManager.default.removeItem(at: targetDir) } + + logger.info("Downloading SenseVoice models from Hugging Face") + try await DownloadUtils.downloadRepo(.senseVoiceSmall, to: modelsRoot, progressHandler: progressHandler) + logger.info("Successfully downloaded SenseVoice models") + return targetDir + } + + public static func modelsExist( + at directory: URL, + precision: SenseVoiceEncoderPrecision = .fp16 + ) -> Bool { + let required = [ + ModelNames.SenseVoice.preprocessorFile, + precision.modelName + ".mlmodelc", + ModelNames.SenseVoice.vocabularyFile, + ] + return required.allSatisfy { + FileManager.default.fileExists(atPath: directory.appendingPathComponent($0).path) + } + } + + public static func load( + from directory: URL, + precision: SenseVoiceEncoderPrecision = .fp16 + ) throws -> SenseVoiceModels { + let cpuConfig = MLModelConfiguration() + cpuConfig.computeUnits = .cpuOnly + + let encoderConfig = MLModelConfiguration() + encoderConfig.computeUnits = precision.computeUnits + + let preprocessor = try loadModel( + named: ModelNames.SenseVoice.preprocessor, + from: directory, + configuration: cpuConfig + ) + let encoder = try loadModel( + named: precision.modelName, + from: directory, + configuration: encoderConfig + ) + let vocabulary = try 
loadVocabulary(from: directory) + + logger.info("Loaded SenseVoice (encoder: \(precision.rawValue), vocab: \(vocabulary.count))") + return SenseVoiceModels(preprocessor: preprocessor, encoder: encoder, vocabulary: vocabulary) + } + + private static func loadModel( + named name: String, + from directory: URL, + configuration: MLModelConfiguration + ) throws -> MLModel { + let compiledPath = directory.appendingPathComponent("\(name).mlmodelc") + let packagePath = directory.appendingPathComponent("\(name).mlpackage") + let modelURL: URL + + if FileManager.default.fileExists(atPath: compiledPath.path) { + modelURL = compiledPath + } else if FileManager.default.fileExists(atPath: packagePath.path) { + modelURL = try MLModel.compileModel(at: packagePath) + } else { + throw ASRError.processingFailed("SenseVoice model not found: \(name)") + } + + return try MLModel(contentsOf: modelURL, configuration: configuration) + } + + private static func loadVocabulary(from directory: URL) throws -> [Int: String] { + let path = directory.appendingPathComponent(ModelNames.SenseVoice.vocabularyFile) + let data = try Data(contentsOf: path) + + if let array = try? JSONSerialization.jsonObject(with: data) as? [String] { + return Dictionary(uniqueKeysWithValues: array.enumerated().map { index, token in (index, token) }) + } + + if let dict = try? JSONSerialization.jsonObject(with: data) as? 
[String: String] { + return Dictionary(uniqueKeysWithValues: dict.compactMap { key, token in + guard let index = Int(key) else { return nil } + return (index, token) + }) + } + + throw ASRError.processingFailed("Failed to parse SenseVoice vocab.json.") + } + + private static func modelsRootDirectory() -> URL { + let fileManager = FileManager.default + if let appSupport = fileManager.urls(for: .applicationSupportDirectory, in: .userDomainMask).first { + return appSupport + .appendingPathComponent("FluidAudio", isDirectory: true) + .appendingPathComponent("Models", isDirectory: true) + } + return fileManager.temporaryDirectory + .appendingPathComponent("FluidAudio", isDirectory: true) + .appendingPathComponent("Models", isDirectory: true) + } +} diff --git a/Sources/FluidAudio/ModelNames.swift b/Sources/FluidAudio/ModelNames.swift index 1d4d7e9fc..e44f23513 100644 --- a/Sources/FluidAudio/ModelNames.swift +++ b/Sources/FluidAudio/ModelNames.swift @@ -7,6 +7,7 @@ public enum Repo: String, CaseIterable { case parakeetV2 = "FluidInference/parakeet-tdt-0.6b-v2-coreml" case parakeetCtc110m = "FluidInference/parakeet-ctc-110m-coreml" case parakeetCtc06b = "FluidInference/parakeet-ctc-0.6b-coreml" + case senseVoiceSmall = "FluidInference/sensevoice-small-coreml" case parakeetEou160 = "FluidInference/parakeet-realtime-eou-120m-coreml/160ms" case parakeetEou320 = "FluidInference/parakeet-realtime-eou-120m-coreml/320ms" case parakeetEou1280 = "FluidInference/parakeet-realtime-eou-120m-coreml/1280ms" @@ -35,6 +36,8 @@ public enum Repo: String, CaseIterable { return "parakeet-ctc-110m-coreml" case .parakeetCtc06b: return "parakeet-ctc-0.6b-coreml" + case .senseVoiceSmall: + return "sensevoice-small-coreml" case .parakeetEou160: return "parakeet-realtime-eou-120m-coreml/160ms" case .parakeetEou320: @@ -275,6 +278,28 @@ public enum ModelNames { ] } + /// SenseVoiceSmall CoreML model names, their `.mlmodelc` file names, and the vocabulary file. 
+ public enum SenseVoice { + public static let preprocessor = "SenseVoicePreprocessor" + public static let encoder = "SenseVoiceSmall" + public static let encoderInt8 = "SenseVoiceSmall_int8" + public static let encoderFp32 = "SenseVoiceSmall_fp32" + + public static let preprocessorFile = preprocessor + ".mlmodelc" + public static let encoderFile = encoder + ".mlmodelc" + public static let encoderInt8File = encoderInt8 + ".mlmodelc" + public static let encoderFp32File = encoderFp32 + ".mlmodelc" + public static let vocabularyFile = "vocab.json" + + public static let requiredModels: Set = [ + preprocessorFile, + encoderFile, + encoderInt8File, + encoderFp32File, + vocabularyFile, + ] + } + /// Nemotron Speech Streaming 0.6B model names /// NVIDIA's streaming FastConformer RNNT with encoder cache public enum NemotronStreaming { @@ -584,6 +609,8 @@ public enum ModelNames { return ModelNames.ASR.requiredModelsFused case .parakeetCtc110m, .parakeetCtc06b: return ModelNames.CTC.requiredModels + case .senseVoiceSmall: + return ModelNames.SenseVoice.requiredModels case .parakeetEou160, .parakeetEou320, .parakeetEou1280: return ModelNames.ParakeetEOU.requiredModels case .nemotronStreaming1120, .nemotronStreaming560: