From 40ca498894aaf9cf34401d0c700018edbdb454c4 Mon Sep 17 00:00:00 2001
From: Tai An <antai12232931@outlook.com>
Date: Fri, 1 May 2026 03:17:45 -0700
Subject: [PATCH] fix(ddpm): use _execution_device, validate inputs, free hooks
 (#13649)

Issue 1: replace self.device with self._execution_device so that the execution
device set up by enable_model_cpu_offload() is honored, and call
self.maybe_free_model_hooks() before returning to satisfy the offload contract.

Issue 2: validate that len(generator) == batch_size for list generators, raising
ValueError instead of silently mishandling per-sample seeding (matches the DDIM
and ConsistencyModel pipelines).

Issue 3: validate output_type (must be 'pt', 'np' or 'pil') and add 'pt' tensor
output. Previously any value other than 'pil' silently fell through to NumPy
output.

Closes #13649.
---
 src/diffusers/pipelines/ddpm/pipeline_ddpm.py | 29 ++++++++++++++-----
 1 file changed, 22 insertions(+), 7 deletions(-)

diff --git a/src/diffusers/pipelines/ddpm/pipeline_ddpm.py b/src/diffusers/pipelines/ddpm/pipeline_ddpm.py
index 6d4796cbea1f..e8bc1ea63e58 100644
--- a/src/diffusers/pipelines/ddpm/pipeline_ddpm.py
+++ b/src/diffusers/pipelines/ddpm/pipeline_ddpm.py
@@ -73,7 +73,8 @@ def __call__(
                 The number of denoising steps. More denoising steps usually lead to a higher quality image at the
                 expense of slower inference.
             output_type (`str`, *optional*, defaults to `"pil"`):
-                The output format of the generated image. Choose between `PIL.Image` or `np.array`.
+                The output format of the generated image. Choose between `"pil"` (`PIL.Image`), `"np"`
+                (`np.array`) or `"pt"` (`torch.Tensor`).
             return_dict (`bool`, *optional*, defaults to `True`):
                 Whether or not to return a [`~pipelines.ImagePipelineOutput`] instead of a plain tuple.
 
@@ -97,6 +98,17 @@ def __call__(
                 If `return_dict` is `True`, [`~pipelines.ImagePipelineOutput`] is returned, otherwise a `tuple` is
                 returned where the first element is a list with the generated images
         """
+        if output_type not in ["pt", "np", "pil"]:
+            raise ValueError(f"output_type must be one of ['pt', 'np', 'pil'], got '{output_type}'.")
+
+        if isinstance(generator, list) and len(generator) != batch_size:
+            raise ValueError(
+                f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
+                f" size of {batch_size}. Make sure the batch size matches the length of the generators."
+            )
+
+        device = self._execution_device
+
         # Sample gaussian noise to begin loop
         if isinstance(self.unet.config.sample_size, int):
             image_shape = (
@@ -108,12 +120,12 @@ def __call__(
         else:
             image_shape = (batch_size, self.unet.config.in_channels, *self.unet.config.sample_size)
 
-        if self.device.type == "mps":
+        if device.type == "mps":
             # randn does not work reproducibly on mps
             image = randn_tensor(image_shape, generator=generator, dtype=self.unet.dtype)
-            image = image.to(self.device)
+            image = image.to(device)
         else:
-            image = randn_tensor(image_shape, generator=generator, device=self.device, dtype=self.unet.dtype)
+            image = randn_tensor(image_shape, generator=generator, device=device, dtype=self.unet.dtype)
 
         # set step values
         self.scheduler.set_timesteps(num_inference_steps)
@@ -129,9 +141,12 @@ def __call__(
                 xm.mark_step()
 
         image = (image / 2 + 0.5).clamp(0, 1)
-        image = image.cpu().permute(0, 2, 3, 1).numpy()
-        if output_type == "pil":
-            image = self.numpy_to_pil(image)
+        if output_type != "pt":
+            image = image.cpu().permute(0, 2, 3, 1).numpy()
+            if output_type == "pil":
+                image = self.numpy_to_pil(image)
+
+        self.maybe_free_model_hooks()
 
         if not return_dict:
             return (image,)