From 40ca498894aaf9cf34401d0c700018edbdb454c4 Mon Sep 17 00:00:00 2001 From: Tai An Date: Fri, 1 May 2026 03:17:45 -0700 Subject: [PATCH] fix(ddpm): use _execution_device, validate inputs, free hooks (#13649) Issue 1: replace self.device with self._execution_device so model_cpu_offload's execution device is honored, and call self.maybe_free_model_hooks() before return to satisfy the offload contract. Issue 2: validate that len(generator) == batch_size for list generators, raising ValueError instead of silently mishandling per-sample seeding (matches DDIM/ConsistencyModel pipelines). Issue 3: validate output_type and add 'pt' tensor output. Previously any value other than 'pil' silently fell through to NumPy. Closes #13649. --- src/diffusers/pipelines/ddpm/pipeline_ddpm.py | 29 ++++++++++++++----- 1 file changed, 22 insertions(+), 7 deletions(-) diff --git a/src/diffusers/pipelines/ddpm/pipeline_ddpm.py b/src/diffusers/pipelines/ddpm/pipeline_ddpm.py index 6d4796cbea1f..e8bc1ea63e58 100644 --- a/src/diffusers/pipelines/ddpm/pipeline_ddpm.py +++ b/src/diffusers/pipelines/ddpm/pipeline_ddpm.py @@ -73,7 +73,8 @@ def __call__( The number of denoising steps. More denoising steps usually lead to a higher quality image at the expense of slower inference. output_type (`str`, *optional*, defaults to `"pil"`): - The output format of the generated image. Choose between `PIL.Image` or `np.array`. + The output format of the generated image. Choose between `PIL.Image`, `np.array` or + `torch.Tensor`. return_dict (`bool`, *optional*, defaults to `True`): Whether or not to return a [`~pipelines.ImagePipelineOutput`] instead of a plain tuple. 
@@ -97,6 +98,17 @@ def __call__( If `return_dict` is `True`, [`~pipelines.ImagePipelineOutput`] is returned, otherwise a `tuple` is returned where the first element is a list with the generated images """ + if output_type not in ["pt", "np", "pil"]: + raise ValueError(f"output_type must be one of ['pt', 'np', 'pil'], got '{output_type}'.") + + if isinstance(generator, list) and len(generator) != batch_size: + raise ValueError( + f"You have passed a list of generators of length {len(generator)}, but requested an effective batch" + f" size of {batch_size}. Make sure the batch size matches the length of the generators." + ) + + device = self._execution_device + # Sample gaussian noise to begin loop if isinstance(self.unet.config.sample_size, int): image_shape = ( @@ -108,12 +120,12 @@ def __call__( else: image_shape = (batch_size, self.unet.config.in_channels, *self.unet.config.sample_size) - if self.device.type == "mps": + if device.type == "mps": # randn does not work reproducibly on mps image = randn_tensor(image_shape, generator=generator, dtype=self.unet.dtype) - image = image.to(self.device) + image = image.to(device) else: - image = randn_tensor(image_shape, generator=generator, device=self.device, dtype=self.unet.dtype) + image = randn_tensor(image_shape, generator=generator, device=device, dtype=self.unet.dtype) # set step values self.scheduler.set_timesteps(num_inference_steps) @@ -129,9 +141,12 @@ def __call__( xm.mark_step() image = (image / 2 + 0.5).clamp(0, 1) - image = image.cpu().permute(0, 2, 3, 1).numpy() - if output_type == "pil": - image = self.numpy_to_pil(image) + if output_type != "pt": + image = image.cpu().permute(0, 2, 3, 1).numpy() + if output_type == "pil": + image = self.numpy_to_pil(image) + + self.maybe_free_model_hooks() if not return_dict: return (image,)