From f2f294fd96d703475efb10d01d10909e9c2e5dd4 Mon Sep 17 00:00:00 2001
From: Tai An <antai12232931@outlook.com>
Date: Thu, 30 Apr 2026 00:26:40 -0700
Subject: [PATCH] fix(stable_audio): align batched initial audio with prompts
 in prepare_latents

---
 src/diffusers/pipelines/stable_audio/pipeline_stable_audio.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/diffusers/pipelines/stable_audio/pipeline_stable_audio.py b/src/diffusers/pipelines/stable_audio/pipeline_stable_audio.py
index 351c8b65de0e..5de0bf059987 100644
--- a/src/diffusers/pipelines/stable_audio/pipeline_stable_audio.py
+++ b/src/diffusers/pipelines/stable_audio/pipeline_stable_audio.py
@@ -483,7 +483,7 @@ def prepare_latents(
             audio[:, :, : min(audio_length, audio_vae_length)] = initial_audio_waveforms[:, :, :audio_vae_length]
 
             encoded_audio = self.vae.encode(audio).latent_dist.sample(generator)
-            encoded_audio = encoded_audio.repeat((num_waveforms_per_prompt, 1, 1))
+            encoded_audio = encoded_audio.repeat_interleave(num_waveforms_per_prompt, dim=0)
             latents = encoded_audio + latents
         return latents