diff --git a/src/diffusers/models/controlnets/controlnet_z_image.py b/src/diffusers/models/controlnets/controlnet_z_image.py index 85fa0d365547..bfd3d0c8db87 100644 --- a/src/diffusers/models/controlnets/controlnet_z_image.py +++ b/src/diffusers/models/controlnets/controlnet_z_image.py @@ -597,7 +597,7 @@ def patchify_and_embed( for image, cap_feat in zip(all_image, all_cap_feats): # Caption cap_out, cap_pos_ids, cap_pad_mask, cap_len, _ = self._pad_with_ids( - cap_feat, (len(cap_feat) + (-len(cap_feat)) % SEQ_MULTI_OF, 1, 1), (1, 0, 0), device + cap_feat, (len(cap_feat), 1, 1), (1, 0, 0), device ) all_cap_out.append(cap_out) all_cap_pos_ids.append(cap_pos_ids) diff --git a/src/diffusers/models/transformers/transformer_z_image.py b/src/diffusers/models/transformers/transformer_z_image.py index ba401e7fdef1..75951285ff89 100644 --- a/src/diffusers/models/transformers/transformer_z_image.py +++ b/src/diffusers/models/transformers/transformer_z_image.py @@ -596,7 +596,7 @@ def patchify_and_embed( for image, cap_feat in zip(all_image, all_cap_feats): # Caption cap_out, cap_pos_ids, cap_pad_mask, cap_len, _ = self._pad_with_ids( - cap_feat, (len(cap_feat) + (-len(cap_feat)) % SEQ_MULTI_OF, 1, 1), (1, 0, 0), device + cap_feat, (len(cap_feat), 1, 1), (1, 0, 0), device ) all_cap_out.append(cap_out) all_cap_pos_ids.append(cap_pos_ids) @@ -651,7 +651,7 @@ def patchify_and_embed_omni( noise_val = images_noise_mask[i][j] if j < len(images_noise_mask[i]) else 1 cap_out, cap_pos, cap_mask, cap_len, cap_nm = self._pad_with_ids( cap_item, - (len(cap_item) + (-len(cap_item)) % SEQ_MULTI_OF, 1, 1), + (len(cap_item), 1, 1), (cap_cu_len, 0, 0), device, noise_val,