Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
42 changes: 42 additions & 0 deletions Sources/FluidAudio/ASR/SenseVoice/SenseVoiceConfig.swift
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
import Foundation

/// Static configuration for the SenseVoiceSmall CoreML pipeline.
///
/// SenseVoiceSmall is non-autoregressive: a SANM encoder followed by a single
/// CTC head. The CoreML export runs in three stages: preprocessor,
/// encoder+CTC, and a greedy decode performed on the host.
public enum SenseVoiceConfig {
    /// Width of one LFR feature frame (80 fbank bins stacked with LFR m=7).
    public static let featureDim = 80 * 7

    /// Encoder sequence lengths (in LFR frames) the model was enumerated for,
    /// in ascending order.
    public static let buckets = [128, 256, 512, 1024, 1800]

    /// Number of query tokens the encoder prepends: language, emotion, event,
    /// and text normalization.
    public static let numQueryTokens = 4

    /// CTC blank id; SenseVoice reuses `<unk>` = 0 for it.
    public static let blankId = 0

    /// Language index that requests automatic language detection.
    public static let defaultLanguage: Int32 = 0

    /// Index of the English language embedding in FunASR/SenseVoice's `lid_dict`.
    public static let englishLanguage: Int32 = 4

    /// Text-normalization index for `woitn` (no inverse text normalization).
    public static let defaultTextNorm: Int32 = 15

    /// Sample rate, in Hz, used when loading audio for the pipeline.
    public static let sampleRate = 16_000

    /// Multiplier that maps [-1, 1] samples from AudioConverter into the
    /// int16 range that Kaldi-style feature extraction expects.
    public static let waveformScale: Float = 32_768

    /// Largest supported feature length (the last bucket), about 108 seconds
    /// of audio.
    public static var maxFrames: Int {
        buckets.last ?? 1800
    }

    /// Returns the smallest bucket that can hold `frames` LFR frames.
    ///
    /// - Parameter frames: Number of valid LFR frames.
    /// - Returns: The first bucket whose size is at least `frames`, or the
    ///   largest bucket when `frames` exceeds every bucket.
    public static func pickBucket(forFrames frames: Int) -> Int {
        buckets.first(where: { $0 >= frames }) ?? maxFrames
    }
}
152 changes: 152 additions & 0 deletions Sources/FluidAudio/ASR/SenseVoice/SenseVoiceManager.swift
Original file line number Diff line number Diff line change
@@ -0,0 +1,152 @@
@preconcurrency import CoreML
import Foundation

/// Manager for SenseVoiceSmall transcription.
///
/// Pipeline: waveform -> preprocessor -> LFR features -> encoder+CTC logits
/// -> host greedy CTC decode.
public actor SenseVoiceManager {
    private let models: SenseVoiceModels
    private let language: Int32
    private let textNorm: Int32
    private static let logger = AppLogger(category: "SenseVoiceManager")

    /// Creates a manager around already-loaded models.
    ///
    /// - Parameters:
    ///   - models: Loaded preprocessor/encoder models and vocabulary.
    ///   - language: Value fed to the encoder's `language` input.
    ///   - textNorm: Value fed to the encoder's `textnorm` input.
    public init(
        models: SenseVoiceModels,
        language: Int32 = SenseVoiceConfig.defaultLanguage,
        textNorm: Int32 = SenseVoiceConfig.defaultTextNorm
    ) {
        self.models = models
        self.language = language
        self.textNorm = textNorm
    }

    /// Obtains the models via `SenseVoiceModels.downloadAndLoad` and wraps
    /// them in a manager.
    ///
    /// - Parameters:
    ///   - precision: Encoder weight precision to load.
    ///   - language: Value fed to the encoder's `language` input.
    ///   - textNorm: Value fed to the encoder's `textnorm` input.
    ///   - progressHandler: Forwarded to the model download.
    /// - Throws: Any error raised while downloading or loading the models.
    public static func load(
        precision: SenseVoiceEncoderPrecision = .fp16,
        language: Int32 = SenseVoiceConfig.defaultLanguage,
        textNorm: Int32 = SenseVoiceConfig.defaultTextNorm,
        progressHandler: DownloadUtils.ProgressHandler? = nil
    ) async throws -> SenseVoiceManager {
        let models = try await SenseVoiceModels.downloadAndLoad(
            precision: precision,
            progressHandler: progressHandler
        )
        return SenseVoiceManager(models: models, language: language, textNorm: textNorm)
    }

    /// Transcribe an audio file.
    ///
    /// The file is read through `AudioConverter.resampleAudioFile`, configured
    /// for `SenseVoiceConfig.sampleRate`, and the resulting samples are passed
    /// to `transcribe(audio:)`.
    ///
    /// - Throws: Errors from audio loading or from `transcribe(audio:)`.
    public func transcribe(audioURL: URL) throws -> String {
        let converter = AudioConverter(sampleRate: Double(SenseVoiceConfig.sampleRate))
        let samples = try converter.resampleAudioFile(audioURL)
        return try transcribe(audio: samples)
    }

    /// Transcribe 16 kHz mono float samples in [-1, 1].
    ///
    /// - Parameter audio: Samples to transcribe; must not be empty.
    /// - Returns: The decoded text with `<|...|>` tags removed and surrounding
    ///   whitespace trimmed.
    /// - Throws: `ASRError.processingFailed` for empty input or malformed
    ///   model outputs, plus any CoreML prediction error.
    public func transcribe(audio: [Float]) throws -> String {
        // A zero-length waveform cannot be fed to the preprocessor; fail with
        // a clear message instead of an opaque CoreML shape error.
        guard !audio.isEmpty else {
            throw ASRError.processingFailed("SenseVoice received empty audio.")
        }
        let features = try runPreprocessor(audio: audio)
        let (logits, validFrames) = try runEncoder(features: features)
        return try decode(logits: logits, validFrames: validFrames)
    }

    /// Scales the waveform by `SenseVoiceConfig.waveformScale` and runs the
    /// preprocessor model, returning its `features` output.
    private func runPreprocessor(audio: [Float]) throws -> MLMultiArray {
        let waveform = try MLMultiArray(shape: [1, audio.count as NSNumber], dataType: .float32)
        let scale = SenseVoiceConfig.waveformScale
        let pointer = waveform.dataPointer.assumingMemoryBound(to: Float32.self)
        for (index, sample) in audio.enumerated() {
            pointer[index] = sample * scale
        }

        let input = try MLDictionaryFeatureProvider(dictionary: [
            "waveform": MLFeatureValue(multiArray: waveform)
        ])
        let output = try models.preprocessor.prediction(from: input)
        guard let features = output.featureValue(for: "features")?.multiArrayValue else {
            throw ASRError.processingFailed("SenseVoice preprocessor produced no features.")
        }
        return features
    }

    /// Pads the features into the smallest fitting bucket, runs the encoder,
    /// and returns the CTC logits together with the number of valid output
    /// frames (query tokens + feature frames).
    ///
    /// Features longer than `SenseVoiceConfig.maxFrames` are truncated.
    private func runEncoder(features: MLMultiArray) throws -> (MLMultiArray, Int) {
        let dimension = SenseVoiceConfig.featureDim
        // The copy below is only meaningful for a [1, frames, featureDim] layout.
        guard features.shape.count == 3, features.shape[2].intValue == dimension else {
            throw ASRError.processingFailed(
                "SenseVoice preprocessor produced features with unexpected shape \(features.shape)."
            )
        }

        var frameCount = features.shape[1].intValue
        if frameCount > SenseVoiceConfig.maxFrames {
            Self.logger.warning("Audio exceeds max length; truncating \(frameCount) to \(SenseVoiceConfig.maxFrames) frames")
            frameCount = SenseVoiceConfig.maxFrames
        }
        let bucket = SenseVoiceConfig.pickBucket(forFrames: frameCount)

        let speech = try MLMultiArray(
            shape: [1, bucket as NSNumber, dimension as NSNumber],
            dataType: .float32
        )
        let speechPointer = speech.dataPointer.assumingMemoryBound(to: Float32.self)
        // Zero the whole bucket first so frames past `frameCount` are padding.
        memset(speechPointer, 0, bucket * dimension * MemoryLayout<Float32>.size)

        let frameStride = features.strides[1].intValue
        let binStride = features.strides[2].intValue
        if features.dataType == .float32 {
            if binStride == 1 && frameStride == dimension {
                // Densely packed rows: one bulk copy is safe.
                let copiedCount = frameCount * dimension
                memcpy(speechPointer, features.dataPointer, copiedCount * MemoryLayout<Float32>.size)
            } else {
                // Model outputs are not guaranteed to be densely packed;
                // honor the reported strides when they are not.
                let source = features.dataPointer.assumingMemoryBound(to: Float32.self)
                for frame in 0..<frameCount {
                    let sourceBase = frame * frameStride
                    let targetBase = frame * dimension
                    for bin in 0..<dimension {
                        speechPointer[targetBase + bin] = source[sourceBase + bin * binStride]
                    }
                }
            }
        } else {
            // Non-float32 features: convert element by element through the
            // coordinate subscript, which is independent of storage layout.
            for frame in 0..<frameCount {
                let targetBase = frame * dimension
                for bin in 0..<dimension {
                    speechPointer[targetBase + bin] =
                        features[[0, frame as NSNumber, bin as NSNumber]].floatValue
                }
            }
        }

        let lengths = try MLMultiArray(shape: [1], dataType: .int32)
        lengths[0] = NSNumber(value: frameCount)
        let languageArray = try MLMultiArray(shape: [1], dataType: .int32)
        languageArray[0] = NSNumber(value: language)
        let textNormArray = try MLMultiArray(shape: [1], dataType: .int32)
        textNormArray[0] = NSNumber(value: textNorm)

        let input = try MLDictionaryFeatureProvider(dictionary: [
            "speech": MLFeatureValue(multiArray: speech),
            "speech_lengths": MLFeatureValue(multiArray: lengths),
            "language": MLFeatureValue(multiArray: languageArray),
            "textnorm": MLFeatureValue(multiArray: textNormArray),
        ])
        let output = try models.encoder.prediction(from: input)
        guard let logits = output.featureValue(for: "ctc_logits")?.multiArrayValue else {
            throw ASRError.processingFailed("SenseVoice encoder produced no ctc_logits.")
        }
        return (logits, SenseVoiceConfig.numQueryTokens + frameCount)
    }

    /// Greedy CTC decode: per-frame argmax, collapse repeats, drop blanks,
    /// then map ids to text and strip `<|...|>` tags.
    ///
    /// - Parameters:
    ///   - logits: Encoder output indexed as [0, frame, token].
    ///   - validFrames: Number of leading frames to decode; clamped to the
    ///     frame axis of `logits`.
    /// - Throws: `ASRError.processingFailed` when `logits` is not rank 3 or
    ///   has an empty vocabulary axis.
    private func decode(logits: MLMultiArray, validFrames: Int) throws -> String {
        guard logits.shape.count == 3, logits.shape[2].intValue > 0 else {
            throw ASRError.processingFailed("SenseVoice encoder produced malformed ctc_logits.")
        }
        let vocabularySize = logits.shape[2].intValue
        let frames = min(validFrames, logits.shape[1].intValue)
        var ids: [Int] = []
        var previous = -1

        // Finds the best token for one frame and applies the CTC collapse rule.
        func appendArgmax(valueAt: (Int) -> Float) {
            var best = 0
            var bestValue = valueAt(0)
            for token in 1..<vocabularySize {
                let value = valueAt(token)
                if value > bestValue {
                    bestValue = value
                    best = token
                }
            }
            if best != SenseVoiceConfig.blankId && best != previous {
                ids.append(best)
            }
            previous = best
        }

        if logits.dataType == .float32 {
            // Address elements through the reported strides rather than
            // assuming the buffer is densely packed.
            let frameStride = logits.strides[1].intValue
            let tokenStride = logits.strides[2].intValue
            let pointer = logits.dataPointer.assumingMemoryBound(to: Float32.self)
            for frame in 0..<frames {
                let base = frame * frameStride
                appendArgmax { pointer[base + $0 * tokenStride] }
            }
        } else {
            for frame in 0..<frames {
                appendArgmax { logits[[0, frame as NSNumber, $0 as NSNumber]].floatValue }
            }
        }

        return decodeCtcTokenIds(ids, vocabulary: models.vocabulary)
            .replacingOccurrences(of: "<\\|[^|]*\\|>", with: "", options: .regularExpression)
            .trimmingCharacters(in: .whitespacesAndNewlines)
    }
}
155 changes: 155 additions & 0 deletions Sources/FluidAudio/ASR/SenseVoice/SenseVoiceModels.swift
Original file line number Diff line number Diff line change
@@ -0,0 +1,155 @@
@preconcurrency import CoreML
import Foundation

/// Weight precision of the SenseVoice encoder model to load.
///
/// Each case selects a model name and the compute units the encoder is
/// configured with: fp16 and int8 use `.cpuAndNeuralEngine`, fp32 uses `.all`.
public enum SenseVoiceEncoderPrecision: String, Sendable {
    case fp16, int8, fp32

    /// Base name (without extension) of the encoder model for this precision.
    var modelName: String {
        typealias Names = ModelNames.SenseVoice
        switch self {
        case .fp16:
            return Names.encoder
        case .int8:
            return Names.encoderInt8
        case .fp32:
            return Names.encoderFp32
        }
    }

    /// Compute units used when loading the encoder for this precision.
    var computeUnits: MLComputeUnits {
        switch self {
        case .fp32:
            return .all
        case .fp16, .int8:
            return .cpuAndNeuralEngine
        }
    }
}

/// Loaded SenseVoiceSmall CoreML models plus vocabulary.
public struct SenseVoiceModels: Sendable {
    /// Model that turns a scaled waveform into features.
    public let preprocessor: MLModel
    /// Encoder + CTC model that produces `ctc_logits`.
    public let encoder: MLModel
    /// Token id -> token text.
    public let vocabulary: [Int: String]

    private static let logger = AppLogger(category: "SenseVoiceModels")

    /// Wraps already-loaded models and a vocabulary.
    public init(preprocessor: MLModel, encoder: MLModel, vocabulary: [Int: String]) {
        self.preprocessor = preprocessor
        self.encoder = encoder
        self.vocabulary = vocabulary
    }

    /// Ensures the models are on disk (see `download`) and then loads them.
    ///
    /// - Parameters:
    ///   - precision: Encoder weight precision to fetch and load.
    ///   - progressHandler: Forwarded to the download.
    /// - Throws: Any download, model-loading, or vocabulary-parsing error.
    public static func downloadAndLoad(
        precision: SenseVoiceEncoderPrecision = .fp16,
        progressHandler: DownloadUtils.ProgressHandler? = nil
    ) async throws -> SenseVoiceModels {
        let directory = try await download(precision: precision, progressHandler: progressHandler)
        return try load(from: directory, precision: precision)
    }

    /// Downloads the SenseVoice repo unless the required files already exist.
    ///
    /// - Parameters:
    ///   - precision: Encoder precision whose files must be present to skip
    ///     the download.
    ///   - force: When `true`, removes the target directory (best effort) and
    ///     downloads again even if the files exist.
    ///   - progressHandler: Forwarded to `DownloadUtils.downloadRepo`.
    /// - Returns: The directory the models are expected in.
    /// - Throws: Any error raised by the download.
    public static func download(
        precision: SenseVoiceEncoderPrecision = .fp16,
        force: Bool = false,
        progressHandler: DownloadUtils.ProgressHandler? = nil
    ) async throws -> URL {
        let modelsRoot = modelsRootDirectory()
        let targetDir = modelsRoot.appendingPathComponent(Repo.senseVoiceSmall.folderName, isDirectory: true)

        if !force && modelsExist(at: targetDir, precision: precision) {
            logger.info("SenseVoice models already present at: \(targetDir.path)")
            return targetDir
        }
        // Best effort: a missing directory is not an error when forcing.
        if force { try? FileManager.default.removeItem(at: targetDir) }

        logger.info("Downloading SenseVoice models from Hugging Face")
        try await DownloadUtils.downloadRepo(.senseVoiceSmall, to: modelsRoot, progressHandler: progressHandler)
        logger.info("Successfully downloaded SenseVoice models")
        return targetDir
    }

    /// Reports whether the preprocessor file, the compiled encoder for
    /// `precision`, and the vocabulary file all exist in `directory`.
    public static func modelsExist(
        at directory: URL,
        precision: SenseVoiceEncoderPrecision = .fp16
    ) -> Bool {
        let required = [
            ModelNames.SenseVoice.preprocessorFile,
            precision.modelName + ".mlmodelc",
            ModelNames.SenseVoice.vocabularyFile,
        ]
        return required.allSatisfy {
            FileManager.default.fileExists(atPath: directory.appendingPathComponent($0).path)
        }
    }

    /// Loads the preprocessor, the encoder, and the vocabulary from `directory`.
    ///
    /// The preprocessor is configured for `.cpuOnly`; the encoder uses
    /// `precision.computeUnits`.
    ///
    /// - Throws: `ASRError.processingFailed` when a model or a usable
    ///   vocabulary cannot be found, plus any CoreML or file-reading error.
    public static func load(
        from directory: URL,
        precision: SenseVoiceEncoderPrecision = .fp16
    ) throws -> SenseVoiceModels {
        let cpuConfig = MLModelConfiguration()
        cpuConfig.computeUnits = .cpuOnly

        let encoderConfig = MLModelConfiguration()
        encoderConfig.computeUnits = precision.computeUnits

        let preprocessor = try loadModel(
            named: ModelNames.SenseVoice.preprocessor,
            from: directory,
            configuration: cpuConfig
        )
        let encoder = try loadModel(
            named: precision.modelName,
            from: directory,
            configuration: encoderConfig
        )
        let vocabulary = try loadVocabulary(from: directory)

        logger.info("Loaded SenseVoice (encoder: \(precision.rawValue), vocab: \(vocabulary.count))")
        return SenseVoiceModels(preprocessor: preprocessor, encoder: encoder, vocabulary: vocabulary)
    }

    /// Loads `<name>.mlmodelc` if present, otherwise compiles and loads
    /// `<name>.mlpackage`.
    private static func loadModel(
        named name: String,
        from directory: URL,
        configuration: MLModelConfiguration
    ) throws -> MLModel {
        let compiledPath = directory.appendingPathComponent("\(name).mlmodelc")
        let packagePath = directory.appendingPathComponent("\(name).mlpackage")
        let modelURL: URL

        if FileManager.default.fileExists(atPath: compiledPath.path) {
            modelURL = compiledPath
        } else if FileManager.default.fileExists(atPath: packagePath.path) {
            modelURL = try MLModel.compileModel(at: packagePath)
        } else {
            throw ASRError.processingFailed("SenseVoice model not found: \(name)")
        }

        return try MLModel(contentsOf: modelURL, configuration: configuration)
    }

    /// Reads the vocabulary file and returns a token id -> token map.
    ///
    /// Two JSON layouts are accepted: an array of tokens (the id is the array
    /// position) or an object mapping id strings to tokens. Object keys that
    /// are not integers are skipped.
    ///
    /// - Throws: `ASRError.processingFailed` when the JSON has neither layout
    ///   or when two keys resolve to the same id; file-reading errors are
    ///   propagated unchanged.
    private static func loadVocabulary(from directory: URL) throws -> [Int: String] {
        let path = directory.appendingPathComponent(ModelNames.SenseVoice.vocabularyFile)
        let data = try Data(contentsOf: path)

        // Parse once; invalid JSON falls through to the shared error below.
        let object = try? JSONSerialization.jsonObject(with: data)

        if let tokens = object as? [String] {
            // Array positions are unique by construction.
            return Dictionary(uniqueKeysWithValues: tokens.enumerated().map { ($0.offset, $0.element) })
        }

        if let entries = object as? [String: String] {
            var vocabulary: [Int: String] = [:]
            vocabulary.reserveCapacity(entries.count)
            for (key, token) in entries {
                guard let index = Int(key) else { continue }
                // Distinct JSON keys such as "1" and "01" parse to the same id;
                // report that as an error instead of trapping.
                guard vocabulary.updateValue(token, forKey: index) == nil else {
                    throw ASRError.processingFailed(
                        "SenseVoice vocab.json contains duplicate token id \(index)."
                    )
                }
            }
            return vocabulary
        }

        throw ASRError.processingFailed("Failed to parse SenseVoice vocab.json.")
    }

    /// Returns `<Application Support>/FluidAudio/Models`, falling back to the
    /// temporary directory when Application Support is unavailable.
    private static func modelsRootDirectory() -> URL {
        let fileManager = FileManager.default
        if let appSupport = fileManager.urls(for: .applicationSupportDirectory, in: .userDomainMask).first {
            return appSupport
                .appendingPathComponent("FluidAudio", isDirectory: true)
                .appendingPathComponent("Models", isDirectory: true)
        }
        return fileManager.temporaryDirectory
            .appendingPathComponent("FluidAudio", isDirectory: true)
            .appendingPathComponent("Models", isDirectory: true)
    }
}
Loading