From 1a76cfdc012e3f81dd360cdfcffdb18160cd7509 Mon Sep 17 00:00:00 2001 From: Yongqiang Wang Date: Tue, 12 May 2026 11:32:05 -0700 Subject: [PATCH] Move standalone ASR audio preprocessing module Move the standalone PyTorch LogMel implementation from the temporary nemo/asr/audio_preprocessing.py location to nemo/collections/asr/modules/audio_preprocessing_standalone.py, next to the NeMo ASR audio_preprocessing module it mirrors. The standalone module keeps NeMo-compatible LogMel feature extraction available with only stdlib and PyTorch dependencies inside the implementation. This is needed for consumers that want the NeMo LogMel behavior without depending on the full NeMo ASR stack, such as Lightning, Hydra, librosa, and NeMo neural type imports. Update the standalone tests to import the new modules path and keep coverage against nemo.collections.asr.modules.AudioToMelSpectrogramPreprocessor for output parity, filter-bank parity, dtype conversion, training dither, narrowband augmentation, CUDA execution, and longer input cases. Benchmark results on NVIDIA H100 80GB HBM3: - 1-second 16 kHz inputs, batch 64, 10 warmup, 50 measured iterations: targeted run NeMo 0.514 ms/iter, standalone 0.400 ms/iter, 1.28x speedup; full-file run NeMo 0.475 ms/iter, standalone 0.370 ms/iter, 1.28x speedup. - 10-minute 16 kHz inputs, 1 warmup, 3 measured iterations: batch 1 NeMo 1.138 ms/iter, standalone 1.067 ms/iter, 1.07x speedup, 0.41 GiB peak for both. - 10-minute 16 kHz inputs, batch 4: NeMo 4.072 ms/iter, standalone 3.869 ms/iter, 1.05x speedup, 1.56 GiB peak for both. - 10-minute 16 kHz inputs, batch 8: NeMo 7.840 ms/iter, standalone 7.501 ms/iter, 1.05x speedup, 3.08 GiB peak for both. - 10-minute 16 kHz inputs, batch 16: NeMo 15.453 ms/iter, standalone 14.780 ms/iter, 1.05x speedup, 6.13 GiB peak for both. --- .../modules/audio_preprocessing_standalone.py | 414 ++++++++++++++++ .../asr/test_audio_to_mel_standalone.py | 458 ++++++++++++++++++ 2 files changed, 872 insertions(+) create mode 100644 nemo/collections/asr/modules/audio_preprocessing_standalone.py create mode 100644 tests/collections/asr/test_audio_to_mel_standalone.py diff --git a/nemo/collections/asr/modules/audio_preprocessing_standalone.py b/nemo/collections/asr/modules/audio_preprocessing_standalone.py new file mode 100644 index 000000000000..397b6dc70468 --- /dev/null +++ b/nemo/collections/asr/modules/audio_preprocessing_standalone.py @@ -0,0 +1,414 @@ +# Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import math +import random +from typing import Optional, Union + +import torch +import torch.nn as nn + +CONSTANT = 1e-5 + + +def _hz_to_mel(frequencies: torch.Tensor) -> torch.Tensor: + """Slaney mel conversion matching librosa with htk=False.""" + f_sp = 200.0 / 3 + mels = frequencies / f_sp + + min_log_hz = 1000.0 + min_log_mel = min_log_hz / f_sp + logstep = math.log(6.4) / 27.0 + + log_t = frequencies >= min_log_hz + if log_t.any(): + mels = torch.where(log_t, min_log_mel + torch.log(frequencies / min_log_hz) / logstep, mels) + return mels + + +def _mel_to_hz(mels: torch.Tensor) -> torch.Tensor: + """Inverse Slaney mel conversion matching librosa with htk=False.""" + f_sp = 200.0 / 3 + freqs = f_sp * mels + + min_log_hz = 1000.0 + min_log_mel = min_log_hz / f_sp + logstep = math.log(6.4) / 27.0 + + log_t = mels >= min_log_mel + if log_t.any(): + freqs = torch.where(log_t, min_log_hz * torch.exp(logstep * (mels - min_log_mel)), freqs) + return freqs + + +def _mel_frequencies(n_mels: int, fmin: float, fmax: float) -> torch.Tensor: + min_mel = _hz_to_mel(torch.tensor(float(fmin), dtype=torch.float64)) + max_mel = _hz_to_mel(torch.tensor(float(fmax), dtype=torch.float64)) + mels = torch.linspace(min_mel, max_mel, n_mels, dtype=torch.float64) + return _mel_to_hz(mels) + + +def _normalize_filterbank(filterbank: torch.Tensor, norm: Optional[Union[str, float]]) -> torch.Tensor: + if norm is None: + return filterbank + + if norm == "slaney": + return filterbank + + if not isinstance(norm, (int, float)): + raise ValueError(f"Unsupported mel_norm value: {norm!r}") + + norm = float(norm) + magnitudes = filterbank.abs() + if math.isinf(norm): + lengths = magnitudes.max(dim=-1, keepdim=True).values + elif norm == 0: + lengths = (magnitudes > 0).sum(dim=-1, keepdim=True).to(filterbank.dtype) + else: + lengths = magnitudes.pow(norm).sum(dim=-1, keepdim=True).pow(1.0 / norm) + + tiny = torch.finfo(filterbank.dtype).tiny + return torch.where(lengths > tiny, filterbank / lengths.clamp_min(tiny), filterbank) + + +def _create_mel_filterbank( + sample_rate: int, + n_fft: int, + n_mels: int, + fmin: float, + fmax: float, + norm: Optional[Union[str, float]], +) -> torch.Tensor: + """Create a mel filter bank equivalent to librosa.filters.mel(..., htk=False).""" + fftfreqs = torch.linspace(0, float(sample_rate) / 2, n_fft // 2 + 1, dtype=torch.float64) + mel_f = _mel_frequencies(n_mels + 2, fmin=fmin, fmax=fmax) + + fdiff = mel_f[1:] - mel_f[:-1] + ramps = mel_f.unsqueeze(1) - fftfreqs.unsqueeze(0) + + lower = -ramps[:-2] / fdiff[:-1].unsqueeze(1) + upper = ramps[2:] / fdiff[1:].unsqueeze(1) + weights = torch.minimum(lower, upper).clamp_min(0) + + if norm == "slaney": + enorm = 2.0 / (mel_f[2 : n_mels + 2] - mel_f[:n_mels]) + weights *= enorm.unsqueeze(1) + else: + weights = _normalize_filterbank(weights, norm) + + return weights.to(dtype=torch.float32).unsqueeze(0) + + +def normalize_batch(x: torch.Tensor, seq_len: torch.Tensor, normalize_type): + x_mean = None + x_std = None + if normalize_type == "per_feature": + batch_size = x.shape[0] + max_time = x.shape[2] + + if ( + torch.cuda.is_available() + and not torch.cuda.is_current_stream_capturing() + and torch.any(seq_len == 1).item() + ): + raise ValueError( + "normalize_batch with `per_feature` normalize_type received a tensor of length 1. This will result " + "in torch.std() returning nan. Make sure your audio length has enough samples for a single feature " + "(ex. at least `hop_length` for Mel Spectrograms)." + ) + time_steps = torch.arange(max_time, device=x.device).unsqueeze(0).expand(batch_size, max_time) + valid_mask = time_steps < seq_len.unsqueeze(1) + x_mean_numerator = torch.where(valid_mask.unsqueeze(1), x, 0.0).sum(axis=2) + x_mean_denominator = valid_mask.sum(axis=1) + x_mean = x_mean_numerator / x_mean_denominator.unsqueeze(1) + + x_std = torch.sqrt( + torch.sum(torch.where(valid_mask.unsqueeze(1), x - x_mean.unsqueeze(2), 0.0) ** 2, axis=2) + / (x_mean_denominator.unsqueeze(1) - 1.0) + ) + x_std = x_std.masked_fill(x_std.isnan(), 0.0) + x_std += CONSTANT + normalized = (x - x_mean.unsqueeze(2)) / x_std.unsqueeze(2) + normalized.masked_fill_(~valid_mask.unsqueeze(1), 0.0) + return normalized, x_mean, x_std + elif normalize_type == "all_features": + x_mean = torch.zeros(seq_len.shape, dtype=x.dtype, device=x.device) + x_std = torch.zeros(seq_len.shape, dtype=x.dtype, device=x.device) + for i in range(x.shape[0]): + x_mean[i] = x[i, :, : seq_len[i].item()].mean() + x_std[i] = x[i, :, : seq_len[i].item()].std() + x_std += CONSTANT + return (x - x_mean.view(-1, 1, 1)) / x_std.view(-1, 1, 1), x_mean, x_std + elif "fixed_mean" in normalize_type and "fixed_std" in normalize_type: + x_mean = torch.tensor(normalize_type["fixed_mean"], device=x.device) + x_std = torch.tensor(normalize_type["fixed_std"], device=x.device) + return ( + (x - x_mean.view(x.shape[0], x.shape[1]).unsqueeze(2)) / x_std.view(x.shape[0], x.shape[1]).unsqueeze(2), + x_mean, + x_std, + ) + else: + return x, x_mean, x_std + + +def splice_frames(x: torch.Tensor, frame_splicing: int) -> torch.Tensor: + seq = [x] + for n in range(1, frame_splicing): + seq.append(torch.cat([x[:, :, :n], x[:, :, n:]], dim=2)) + return torch.cat(seq, dim=1) + + +class AudioToMelSpectrogramPreprocessor(nn.Module): + """Standalone PyTorch implementation of NeMo's log-mel ASR preprocessor. + + This class mirrors ``nemo.collections.asr.modules.AudioToMelSpectrogramPreprocessor`` + without importing NeMo ASR, Lightning, Hydra, librosa, or NeMo neural type + dependencies. It uses only Python stdlib and PyTorch primitives. + """ + + def __init__( + self, + sample_rate=16000, + window_size=0.02, + window_stride=0.01, + n_window_size=None, + n_window_stride=None, + window="hann", + normalize="per_feature", + n_fft=None, + preemph=0.97, + features=64, + lowfreq=0, + highfreq=None, + log=True, + log_zero_guard_type="add", + log_zero_guard_value=2**-24, + dither=1e-5, + pad_to=16, + frame_splicing=1, + exact_pad=False, + pad_value=0, + mag_power=2.0, + rng=None, + nb_augmentation_prob=0.0, + nb_max_freq=4000, + mel_norm="slaney", + use_torchaudio: bool = False, + stft_exact_pad=False, + stft_conv=False, + ): + del use_torchaudio, stft_exact_pad, stft_conv + super().__init__() + + self._sample_rate = sample_rate + if window_size and n_window_size: + raise ValueError(f"{self} received both window_size and n_window_size. Only one should be specified.") + if window_stride and n_window_stride: + raise ValueError(f"{self} received both window_stride and n_window_stride. Only one should be specified.") + if window_size: + n_window_size = int(window_size * self._sample_rate) + if window_stride: + n_window_stride = int(window_stride * self._sample_rate) + if ( + n_window_size is None + or n_window_stride is None + or not isinstance(n_window_size, int) + or not isinstance(n_window_stride, int) + or n_window_size <= 0 + or n_window_stride <= 0 + ): + raise ValueError( + f"{self} got an invalid value for either n_window_size or n_window_stride. Both must be positive ints." + ) + if exact_pad and n_window_stride % 2 == 1: + raise NotImplementedError( + f"{self} received exact_pad == True, but hop_size was odd. If audio_length % hop_size == 0. Then the " + "returned spectrogram would not be of length audio_length // hop_size. Please use an even hop_size." + ) + if log_zero_guard_type not in ["add", "clamp"]: + raise ValueError( + f"{self} received {log_zero_guard_type} for the log_zero_guard_type parameter. It must be either " + "'add' or 'clamp'." + ) + + self.win_length = n_window_size + self.hop_length = n_window_stride + self.n_fft = n_fft or 2 ** math.ceil(math.log2(self.win_length)) + self.stft_pad_amount = (self.n_fft - self.hop_length) // 2 if exact_pad else None + self.exact_pad = exact_pad + self.normalize = normalize + self.log = log + self.dither = dither + self.frame_splicing = frame_splicing + self.nfilt = features + self.preemph = preemph + self.pad_to = pad_to + self.pad_value = pad_value + self.mag_power = mag_power + self.log_zero_guard_type = log_zero_guard_type + self.log_zero_guard_value = log_zero_guard_value + self._rng = random.Random() if rng is None else rng + self.nb_augmentation_prob = nb_augmentation_prob + + window_fns = { + 'hann': torch.hann_window, + 'hamming': torch.hamming_window, + 'blackman': torch.blackman_window, + 'bartlett': torch.bartlett_window, + } + window_fn = window_fns.get(window, None) + window_tensor = window_fn(self.win_length, periodic=False) if window_fn else None + self.register_buffer("window", window_tensor) + + highfreq = highfreq or sample_rate / 2 + self.register_buffer( + "fb", + _create_mel_filterbank( + sample_rate=sample_rate, + n_fft=self.n_fft, + n_mels=features, + fmin=lowfreq, + fmax=highfreq, + norm=mel_norm, + ), + ) + + max_length = self.get_seq_len(torch.tensor(16.7 * sample_rate, dtype=torch.float)) + max_pad = pad_to - (max_length % pad_to) if pad_to > 0 else 0 + self.max_length = max_length + max_pad + + if self.nb_augmentation_prob > 0.0: + if nb_max_freq >= sample_rate / 2: + self.nb_augmentation_prob = 0.0 + else: + self._nb_max_fft_bin = int((nb_max_freq / sample_rate) * self.n_fft) + + self.register_buffer("dtype_sentinel_tensor", torch.tensor((), dtype=torch.float32), persistent=False) + + @property + def filter_banks(self) -> torch.Tensor: + return self.fb + + def input_example(self, max_batch: int = 8, max_dim: int = 32000, min_length: int = 200): + dev = self.filter_banks.device + signals = torch.randn(size=[max_batch, max_dim], device=dev) + lengths = torch.randint(low=min_length, high=max_dim, size=[max_batch], device=dev) + lengths[0] = max_dim + return signals, lengths + + def get_seq_len(self, seq_len: torch.Tensor) -> torch.Tensor: + pad_amount = self.stft_pad_amount * 2 if self.stft_pad_amount is not None else self.n_fft // 2 * 2 + seq_len = torch.floor_divide((seq_len + pad_amount - self.n_fft), self.hop_length) + return seq_len.to(dtype=torch.long) + + def log_zero_guard_value_fn(self, x: torch.Tensor): + if isinstance(self.log_zero_guard_value, str): + if self.log_zero_guard_value == "tiny": + return torch.finfo(x.dtype).tiny + elif self.log_zero_guard_value == "eps": + return torch.finfo(x.dtype).eps + else: + raise ValueError( + f"{self} received {self.log_zero_guard_value} for the log_zero_guard_type parameter. It must be " + "either a number, 'tiny', or 'eps'" + ) + else: + return self.log_zero_guard_value + + def stft(self, x: torch.Tensor) -> torch.Tensor: + window = self.window.to(dtype=torch.float, device=x.device) if self.window is not None else None + return torch.stft( + x, + n_fft=self.n_fft, + hop_length=self.hop_length, + win_length=self.win_length, + center=False if self.exact_pad else True, + window=window, + return_complex=True, + pad_mode="constant", + ) + + @torch.no_grad() + def get_features(self, input_signal: torch.Tensor, length: torch.Tensor, linear_spec: bool = False): + x = input_signal + seq_len_time = length + seq_len_unfixed = self.get_seq_len(length) + seq_len = torch.where(length == 0, torch.zeros_like(seq_len_unfixed), seq_len_unfixed) + + if self.stft_pad_amount is not None: + x = torch.nn.functional.pad( + x.unsqueeze(1), (self.stft_pad_amount, self.stft_pad_amount), "constant" + ).squeeze(1) + + if self.training and self.dither > 0: + x += self.dither * torch.randn_like(x) + + if self.preemph is not None: + timemask = torch.arange(x.shape[1], device=x.device).unsqueeze(0) < seq_len_time.unsqueeze(1) + x = torch.cat((x[:, 0].unsqueeze(1), x[:, 1:] - self.preemph * x[:, :-1]), dim=1) + x = x.masked_fill(~timemask, 0.0) + + with torch.amp.autocast(x.device.type, enabled=False): + x = self.stft(x) + + x = torch.view_as_real(x) + x = torch.sqrt(x.pow(2).sum(-1)) + + if self.training and self.nb_augmentation_prob > 0.0: + for idx in range(x.shape[0]): + if self._rng.random() < self.nb_augmentation_prob: + x[idx, self._nb_max_fft_bin :, :] = 0.0 + + if self.mag_power != 1.0: + x = x.pow(self.mag_power) + + if linear_spec: + return x, seq_len + + with torch.amp.autocast(x.device.type, enabled=False): + x = torch.matmul(self.fb.to(x.dtype), x) + + if self.log: + if self.log_zero_guard_type == "add": + x = torch.log(x + self.log_zero_guard_value_fn(x)) + elif self.log_zero_guard_type == "clamp": + x = torch.log(torch.clamp(x, min=self.log_zero_guard_value_fn(x))) + else: + raise ValueError("log_zero_guard_type was not understood") + + if self.frame_splicing > 1: + x = splice_frames(x, self.frame_splicing) + + if self.normalize: + x, _, _ = normalize_batch(x, seq_len, normalize_type=self.normalize) + + max_len = x.size(-1) + mask = torch.arange(max_len, device=x.device) + mask = mask.repeat(x.size(0), 1) >= seq_len.unsqueeze(1) + x = x.masked_fill(mask.unsqueeze(1).type(torch.bool).to(device=x.device), self.pad_value) + del mask + + if self.pad_to == "max": + x = nn.functional.pad(x, (0, self.max_length - x.size(-1)), value=self.pad_value) + elif self.pad_to > 0: + pad_amt = x.size(-1) % self.pad_to + if pad_amt != 0: + x = nn.functional.pad(x, (0, self.pad_to - pad_amt), value=self.pad_value) + return x, seq_len + + @torch.no_grad() + def forward(self, input_signal: torch.Tensor, length: torch.Tensor): + processed_signal, processed_length = self.get_features(input_signal.to(torch.float32), length) + processed_signal = processed_signal.to(self.dtype_sentinel_tensor.dtype) + return processed_signal, processed_length diff --git a/tests/collections/asr/test_audio_to_mel_standalone.py b/tests/collections/asr/test_audio_to_mel_standalone.py new file mode 100644 index 000000000000..08341b442a06 --- /dev/null +++ b/tests/collections/asr/test_audio_to_mel_standalone.py @@ -0,0 +1,458 @@ +# Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import random +import subprocess +import sys + +import pytest +import torch + +from nemo.collections.asr.modules import AudioToMelSpectrogramPreprocessor as NemoAudioToMelSpectrogramPreprocessor +from nemo.collections.asr.modules.audio_preprocessing_standalone import ( + AudioToMelSpectrogramPreprocessor as StandaloneAudioToMelSpectrogramPreprocessor, +) + + +def _make_inputs(max_length=4096): + torch.manual_seed(2026) + signal = torch.randn(4, max_length, dtype=torch.float32) + time = torch.linspace(0, 1, max_length, dtype=torch.float32) + signal[0] += 0.05 * torch.sin(2 * torch.pi * 220 * time) + signal[1] += 0.03 * torch.sin(2 * torch.pi * 440 * time) + signal[2] += torch.linspace(-0.1, 0.1, max_length) + lengths = torch.tensor([max_length, max_length - 317, max_length // 2 + 129, max_length // 4 + 73]) + return signal, lengths + + +def _make_benchmark_inputs(batch_size=64, max_length=16000, device="cuda"): + torch.manual_seed(2026) + signal = torch.randn(batch_size, max_length, dtype=torch.float32, device=device) + offsets = (torch.arange(batch_size, device=device, dtype=torch.long) * 37) % (max_length // 2) + lengths = max_length - offsets + return signal, lengths + + +def _compare_preprocessors(config, *, training=False, dtype=None, atol=2e-5, rtol=2e-5): + signal, lengths = _make_inputs() + nemo_preprocessor = NemoAudioToMelSpectrogramPreprocessor(**config) + standalone_preprocessor = StandaloneAudioToMelSpectrogramPreprocessor(**config) + + if training: + nemo_preprocessor.train() + standalone_preprocessor.train() + else: + nemo_preprocessor.eval() + standalone_preprocessor.eval() + + if dtype is not None: + nemo_preprocessor = nemo_preprocessor.to(dtype=dtype) + standalone_preprocessor = standalone_preprocessor.to(dtype=dtype) + + torch.manual_seed(12345) + nemo_features, nemo_lengths = nemo_preprocessor(input_signal=signal.clone(), length=lengths.clone()) + torch.manual_seed(12345) + standalone_features, standalone_lengths = standalone_preprocessor( + input_signal=signal.clone(), length=lengths.clone() + ) + + assert standalone_lengths.equal(nemo_lengths) + assert standalone_features.shape == nemo_features.shape + assert standalone_features.dtype == nemo_features.dtype + torch.testing.assert_close(standalone_features.float(), nemo_features.float(), atol=atol, rtol=rtol) + + +def _benchmark_cuda_forward(preprocessor, signal, lengths, *, warmup=10, iterations=50): + with torch.inference_mode(): + for _ in range(warmup): + preprocessor(input_signal=signal, length=lengths) + + torch.cuda.synchronize() + torch.cuda.reset_peak_memory_stats() + start = torch.cuda.Event(enable_timing=True) + end = torch.cuda.Event(enable_timing=True) + + start.record() + for _ in range(iterations): + preprocessor(input_signal=signal, length=lengths) + end.record() + torch.cuda.synchronize() + + return start.elapsed_time(end) / iterations, torch.cuda.max_memory_allocated() + + +def _run_cuda_benchmark_case(config, *, batch_size, max_length, warmup, iterations): + signal, lengths = _make_benchmark_inputs(batch_size=batch_size, max_length=max_length) + nemo_preprocessor = NemoAudioToMelSpectrogramPreprocessor(**config).cuda().eval() + standalone_preprocessor = StandaloneAudioToMelSpectrogramPreprocessor(**config).cuda().eval() + + with torch.inference_mode(): + nemo_features, nemo_lengths = nemo_preprocessor(input_signal=signal, length=lengths) + standalone_features, standalone_lengths = standalone_preprocessor(input_signal=signal, length=lengths) + + assert nemo_features.is_cuda + assert standalone_features.is_cuda + assert standalone_lengths.equal(nemo_lengths) + torch.testing.assert_close(standalone_features.float(), nemo_features.float(), atol=1e-4, rtol=1e-4) + + nemo_ms, nemo_peak_memory = _benchmark_cuda_forward( + nemo_preprocessor, signal, lengths, warmup=warmup, iterations=iterations + ) + standalone_ms, standalone_peak_memory = _benchmark_cuda_forward( + standalone_preprocessor, signal, lengths, warmup=warmup, iterations=iterations + ) + + return { + "batch_size": batch_size, + "max_length": max_length, + "iterations": iterations, + "nemo_ms": nemo_ms, + "standalone_ms": standalone_ms, + "speed_ratio": nemo_ms / standalone_ms, + "nemo_peak_memory_gb": nemo_peak_memory / 1024**3, + "standalone_peak_memory_gb": standalone_peak_memory / 1024**3, + } + + +@pytest.mark.unit +def test_standalone_audio_to_mel_imports_from_modules_path(): + code = """ +from nemo.collections.asr.modules.audio_preprocessing_standalone import AudioToMelSpectrogramPreprocessor +AudioToMelSpectrogramPreprocessor(dither=0) +""" + env = os.environ.copy() + env["PYTHONPATH"] = os.getcwd() + os.pathsep + env.get("PYTHONPATH", "") + result = subprocess.run([sys.executable, "-c", code], capture_output=True, text=True, env=env, check=False) + + assert result.returncode == 0, result.stdout + result.stderr + + +@pytest.mark.unit +@pytest.mark.parametrize( + "config", + [ + {"dither": 0, "pad_to": 0, "normalize": None}, + { + "dither": 0, + "pad_to": 16, + "normalize": "per_feature", + "features": 80, + "lowfreq": 20, + "highfreq": 7600, + }, + { + "sample_rate": 8000, + "window_size": None, + "window_stride": None, + "n_window_size": 200, + "n_window_stride": 80, + "window": "hamming", + "normalize": "all_features", + "n_fft": 512, + "preemph": None, + "features": 40, + "lowfreq": 50, + "highfreq": 3600, + "log_zero_guard_type": "clamp", + "log_zero_guard_value": "eps", + "dither": 0, + "pad_to": 8, + "mag_power": 1.0, + }, + { + "window_size": None, + "window_stride": None, + "n_window_size": 320, + "n_window_stride": 160, + "window": "blackman", + "normalize": None, + "n_fft": 512, + "preemph": 0.95, + "features": 32, + "log": False, + "dither": 0, + "pad_to": 0, + "frame_splicing": 3, + "exact_pad": True, + "pad_value": -11.0, + }, + { + "window": "bartlett", + "normalize": { + "fixed_mean": [[-14.0] * 24, [-13.5] * 24, [-13.0] * 24, [-12.5] * 24], + "fixed_std": [[2.0] * 24, [2.1] * 24, [2.2] * 24, [2.3] * 24], + }, + "features": 24, + "lowfreq": 10, + "highfreq": 7400, + "mel_norm": None, + "dither": 0, + "pad_to": 4, + }, + ], +) +def test_standalone_audio_to_mel_matches_nemo_outputs(config): + _compare_preprocessors(config, atol=1e-4, rtol=1e-4) + + +@pytest.mark.unit +@pytest.mark.parametrize( + "config", + [ + {"features": 64, "n_fft": None, "lowfreq": 0, "highfreq": None, "mel_norm": "slaney"}, + { + "sample_rate": 8000, + "window_size": None, + "window_stride": None, + "n_window_size": 200, + "n_window_stride": 80, + "features": 40, + "n_fft": 512, + "lowfreq": 50, + "highfreq": 3600, + "mel_norm": None, + }, + ], +) +def test_standalone_audio_to_mel_filter_banks_match_nemo(config): + nemo_preprocessor = NemoAudioToMelSpectrogramPreprocessor(dither=0, **config) + standalone_preprocessor = StandaloneAudioToMelSpectrogramPreprocessor(dither=0, **config) + + torch.testing.assert_close( + standalone_preprocessor.filter_banks, + nemo_preprocessor.filter_banks, + atol=1e-7, + rtol=1e-6, + ) + + +@pytest.mark.unit +def test_standalone_audio_to_mel_matches_nemo_dtype_conversion(): + _compare_preprocessors({"dither": 0, "pad_to": 0, "normalize": None}, dtype=torch.float16, atol=1e-3, rtol=1e-3) + + +@pytest.mark.unit +def test_standalone_audio_to_mel_matches_nemo_training_dither(): + _compare_preprocessors( + {"dither": 1e-4, "pad_to": 0, "normalize": None}, + training=True, + atol=1e-4, + rtol=1e-4, + ) + + +@pytest.mark.unit +def test_standalone_audio_to_mel_matches_nemo_training_narrowband_augmentation(): + config = { + "dither": 0, + "pad_to": 0, + "normalize": None, + "n_fft": 512, + "nb_augmentation_prob": 1.0, + "nb_max_freq": 3000, + "rng": random.Random(7), + } + _compare_preprocessors(config, training=True, atol=1e-4, rtol=1e-4) + + +@pytest.mark.unit +@pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA is not available") +def test_standalone_audio_to_mel_cuda_benchmark_matches_nemo(capsys): + config = { + "dither": 0, + "pad_to": 0, + "normalize": None, + "features": 80, + } + batch_size = 64 + max_length = 16000 + iterations = 50 + + result = _run_cuda_benchmark_case( + config, + batch_size=batch_size, + max_length=max_length, + warmup=10, + iterations=iterations, + ) + + with capsys.disabled(): + print( + "\nLogMel CUDA benchmark " + f"(device={torch.cuda.get_device_name(0)}, batch_size={batch_size}, samples={max_length}, " + f"iterations={iterations})" + ) + print(f"NeMo AudioToMelSpectrogramPreprocessor: {result['nemo_ms']:.3f} ms/iter") + print(f"Standalone AudioToMelSpectrogramPreprocessor: {result['standalone_ms']:.3f} ms/iter") + print(f"Standalone speed vs NeMo: {result['speed_ratio']:.2f}x") + + +@pytest.mark.skipduringci +@pytest.mark.parametrize("batch_size", [1, 4, 8, 16]) +@pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA is not available") +@pytest.mark.skipif( + os.environ.get("NEMO_RUN_LONG_LOGMEL_BENCHMARK") != "1", + reason="Set NEMO_RUN_LONG_LOGMEL_BENCHMARK=1 to run the long CUDA LogMel benchmark", +) +def test_standalone_audio_to_mel_cuda_long_benchmark_matches_nemo(batch_size, capsys): + """Compare NeMo and standalone LogMel extraction on long CUDA inputs. + + This benchmark is opt-in via ``NEMO_RUN_LONG_LOGMEL_BENCHMARK=1`` because it allocates large tensors. + On an NVIDIA H100 80GB HBM3 with 10-minute 16 kHz inputs, 1 warmup, and 3 measured iterations, + standalone was consistently faster while using the same peak memory: + + +------------+---------------+-------------------+---------+-------------+ + | Batch size | NeMo ms/iter | Standalone ms/iter| Speedup | Peak memory | + +------------+---------------+-------------------+---------+-------------+ + | 1 | 1.138 | 1.067 | 1.07x | 0.41 GiB | + | 4 | 4.072 | 3.869 | 1.05x | 1.56 GiB | + | 8 | 7.840 | 7.501 | 1.05x | 3.08 GiB | + | 16 | 15.453 | 14.780 | 1.05x | 6.13 GiB | + +------------+---------------+-------------------+---------+-------------+ + + The standalone speedup is smaller for 10-minute audio than for the 1-second batch-64 benchmark, + where standalone was about 1.28x faster. + """ + config = { + "dither": 0, + "pad_to": 0, + "normalize": None, + "features": 80, + } + sample_rate = 16000 + duration_seconds = int(os.environ.get("NEMO_LONG_LOGMEL_BENCHMARK_SECONDS", "600")) + max_length = sample_rate * duration_seconds + warmup = int(os.environ.get("NEMO_LONG_LOGMEL_BENCHMARK_WARMUP", "1")) + iterations = int(os.environ.get("NEMO_LONG_LOGMEL_BENCHMARK_ITERATIONS", "3")) + + result = _run_cuda_benchmark_case( + config, + batch_size=batch_size, + max_length=max_length, + warmup=warmup, + iterations=iterations, + ) + + with capsys.disabled(): + print( + "\nLong LogMel CUDA benchmark " + f"(device={torch.cuda.get_device_name(0)}, batch_size={batch_size}, " + f"duration_seconds={duration_seconds}, samples={max_length}, iterations={iterations})" + ) + print( + f"NeMo AudioToMelSpectrogramPreprocessor: {result['nemo_ms']:.3f} ms/iter, " + f"peak={result['nemo_peak_memory_gb']:.2f} GiB" + ) + print( + f"Standalone AudioToMelSpectrogramPreprocessor: {result['standalone_ms']:.3f} ms/iter, " + f"peak={result['standalone_peak_memory_gb']:.2f} GiB" + ) + print(f"Standalone speed vs NeMo: {result['speed_ratio']:.2f}x") + + +def _make_30s_inputs(sample_rate=16000): + """Create test inputs for 30-second audio clips.""" + max_length = 30 * sample_rate # 480,000 samples at 16kHz + torch.manual_seed(2026) + signal = torch.randn(32, max_length, dtype=torch.float32) + time = torch.linspace(0, 30, max_length, dtype=torch.float32) + # Add some structured signals + signal[0] += 0.05 * torch.sin(2 * torch.pi * 220 * time) + signal[1] += 0.03 * torch.sin(2 * torch.pi * 440 * time) + signal[2] += torch.linspace(-0.1, 0.1, max_length) + # Variable lengths simulating different clip durations + lengths = torch.tensor([max_length - i * 10000 for i in range(32)]) + lengths = torch.clamp(lengths, min=max_length // 4) + return signal, lengths + + +def _make_5min_inputs(sample_rate=16000): + """Create test inputs for 5-minute audio clips.""" + max_length = 5 * 60 * sample_rate # 4,800,000 samples at 16kHz + torch.manual_seed(2026) + signal = torch.randn(8, max_length, dtype=torch.float32) + time = torch.linspace(0, 300, max_length, dtype=torch.float32) # 300 seconds = 5 minutes + # Add some structured signals + signal[0] += 0.05 * torch.sin(2 * torch.pi * 220 * time) + signal[1] += 0.03 * torch.sin(2 * torch.pi * 440 * time) + signal[2] += torch.linspace(-0.1, 0.1, max_length) + # Variable lengths simulating different recording durations + lengths = torch.tensor([max_length - i * 50000 for i in range(8)]) + lengths = torch.clamp(lengths, min=max_length // 4) + return signal, lengths + + +@pytest.mark.unit +def test_standalone_audio_to_mel_matches_nemo_30s_clips(): + """Test with realistic 30-second audio clips (32 samples).""" + config = { + "dither": 0, + "pad_to": 0, + "normalize": "per_feature", + "features": 80, + "lowfreq": 0, + "highfreq": None, + } + + signal, lengths = _make_30s_inputs() + nemo_preprocessor = NemoAudioToMelSpectrogramPreprocessor(**config) + standalone_preprocessor = StandaloneAudioToMelSpectrogramPreprocessor(**config) + + nemo_preprocessor.eval() + standalone_preprocessor.eval() + + torch.manual_seed(12345) + nemo_features, nemo_lengths = nemo_preprocessor(input_signal=signal.clone(), length=lengths.clone()) + torch.manual_seed(12345) + standalone_features, standalone_lengths = standalone_preprocessor( + input_signal=signal.clone(), length=lengths.clone() + ) + + assert standalone_lengths.equal(nemo_lengths) + assert standalone_features.shape == nemo_features.shape + assert standalone_features.dtype == nemo_features.dtype + torch.testing.assert_close(standalone_features.float(), nemo_features.float(), atol=1e-4, rtol=1e-4) + + +@pytest.mark.unit +def test_standalone_audio_to_mel_matches_nemo_5min_recordings(): + """Test with realistic 5-minute audio recordings (8 samples).""" + config = { + "dither": 0, + "pad_to": 0, + "normalize": "all_features", + "features": 64, + "lowfreq": 0, + "highfreq": None, + } + + signal, lengths = _make_5min_inputs() + nemo_preprocessor = NemoAudioToMelSpectrogramPreprocessor(**config) + standalone_preprocessor = StandaloneAudioToMelSpectrogramPreprocessor(**config) + + nemo_preprocessor.eval() + standalone_preprocessor.eval() + + torch.manual_seed(12345) + nemo_features, nemo_lengths = nemo_preprocessor(input_signal=signal.clone(), length=lengths.clone()) + torch.manual_seed(12345) + standalone_features, standalone_lengths = standalone_preprocessor( + input_signal=signal.clone(), length=lengths.clone() + ) + + assert standalone_lengths.equal(nemo_lengths) + assert standalone_features.shape == nemo_features.shape + assert standalone_features.dtype == nemo_features.dtype + torch.testing.assert_close(standalone_features.float(), nemo_features.float(), atol=1e-4, rtol=1e-4)