diff --git a/tests/pipelines/amused/__init__.py b/tests/pipelines/amused/__init__.py deleted file mode 100644 index e69de29bb2d1..000000000000 diff --git a/tests/pipelines/amused/test_amused.py b/tests/pipelines/amused/test_amused.py deleted file mode 100644 index 94759d1f2002..000000000000 --- a/tests/pipelines/amused/test_amused.py +++ /dev/null @@ -1,171 +0,0 @@ -# coding=utf-8 -# Copyright 2025 HuggingFace Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import unittest - -import numpy as np -import torch -from transformers import CLIPTextConfig, CLIPTextModelWithProjection, CLIPTokenizer - -from diffusers import AmusedPipeline, AmusedScheduler, UVit2DModel, VQModel -from diffusers.utils.testing_utils import ( - enable_full_determinism, - require_torch_accelerator, - slow, - torch_device, -) - -from ..pipeline_params import TEXT_TO_IMAGE_BATCH_PARAMS, TEXT_TO_IMAGE_PARAMS -from ..test_pipelines_common import PipelineTesterMixin - - -enable_full_determinism() - - -class AmusedPipelineFastTests(PipelineTesterMixin, unittest.TestCase): - pipeline_class = AmusedPipeline - params = TEXT_TO_IMAGE_PARAMS | {"encoder_hidden_states", "negative_encoder_hidden_states"} - batch_params = TEXT_TO_IMAGE_BATCH_PARAMS - test_layerwise_casting = True - test_group_offloading = True - - def get_dummy_components(self): - torch.manual_seed(0) - transformer = UVit2DModel( - hidden_size=8, - use_bias=False, - hidden_dropout=0.0, - cond_embed_dim=8, - micro_cond_encode_dim=2, - micro_cond_embed_dim=10, - encoder_hidden_size=8, - vocab_size=32, - codebook_size=8, - in_channels=8, - block_out_channels=8, - num_res_blocks=1, - downsample=True, - upsample=True, - block_num_heads=1, - num_hidden_layers=1, - num_attention_heads=1, - attention_dropout=0.0, - intermediate_size=8, - layer_norm_eps=1e-06, - ln_elementwise_affine=True, - ) - scheduler = AmusedScheduler(mask_token_id=31) - torch.manual_seed(0) - vqvae = VQModel( - act_fn="silu", - block_out_channels=[8], - down_block_types=["DownEncoderBlock2D"], - in_channels=3, - latent_channels=8, - layers_per_block=1, - norm_num_groups=8, - num_vq_embeddings=8, - out_channels=3, - sample_size=8, - up_block_types=["UpDecoderBlock2D"], - mid_block_add_attention=False, - lookup_from_codebook=True, - ) - torch.manual_seed(0) - text_encoder_config = CLIPTextConfig( - bos_token_id=0, - eos_token_id=2, - hidden_size=8, - intermediate_size=8, - layer_norm_eps=1e-05, - num_attention_heads=1, - num_hidden_layers=1, - pad_token_id=1, - vocab_size=1000, - projection_dim=8, - ) - text_encoder = CLIPTextModelWithProjection(text_encoder_config) - tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip") - components = { - "transformer": transformer, - "scheduler": scheduler, - "vqvae": vqvae, - "text_encoder": text_encoder, - "tokenizer": tokenizer, - } - return components - - def get_dummy_inputs(self, device, seed=0): - if str(device).startswith("mps"): - generator = torch.manual_seed(seed) - else: - generator = 
torch.Generator(device=device).manual_seed(seed) - inputs = { - "prompt": "A painting of a squirrel eating a burger", - "generator": generator, - "num_inference_steps": 2, - "output_type": "np", - "height": 4, - "width": 4, - } - return inputs - - def test_inference_batch_consistent(self, batch_sizes=[2]): - self._test_inference_batch_consistent(batch_sizes=batch_sizes, batch_generator=False) - - @unittest.skip("aMUSEd does not support lists of generators") - def test_inference_batch_single_identical(self): ... - - -@slow -@require_torch_accelerator -class AmusedPipelineSlowTests(unittest.TestCase): - def test_amused_256(self): - pipe = AmusedPipeline.from_pretrained("amused/amused-256") - pipe.to(torch_device) - image = pipe("dog", generator=torch.Generator().manual_seed(0), num_inference_steps=2, output_type="np").images - image_slice = image[0, -3:, -3:, -1].flatten() - assert image.shape == (1, 256, 256, 3) - expected_slice = np.array([0.4011, 0.3992, 0.379, 0.3856, 0.3772, 0.3711, 0.3919, 0.385, 0.3625]) - assert np.abs(image_slice - expected_slice).max() < 0.003 - - def test_amused_256_fp16(self): - pipe = AmusedPipeline.from_pretrained("amused/amused-256", variant="fp16", torch_dtype=torch.float16) - pipe.to(torch_device) - image = pipe("dog", generator=torch.Generator().manual_seed(0), num_inference_steps=2, output_type="np").images - image_slice = image[0, -3:, -3:, -1].flatten() - assert image.shape == (1, 256, 256, 3) - expected_slice = np.array([0.0554, 0.05129, 0.0344, 0.0452, 0.0476, 0.0271, 0.0495, 0.0527, 0.0158]) - assert np.abs(image_slice - expected_slice).max() < 0.007 - - def test_amused_512(self): - pipe = AmusedPipeline.from_pretrained("amused/amused-512") - pipe.to(torch_device) - image = pipe("dog", generator=torch.Generator().manual_seed(0), num_inference_steps=2, output_type="np").images - image_slice = image[0, -3:, -3:, -1].flatten() - - assert image.shape == (1, 512, 512, 3) - expected_slice = np.array([0.1199, 0.1171, 0.1229, 0.1188, 0.1210, 0.1147, 0.1260, 0.1346, 0.1152]) - assert np.abs(image_slice - expected_slice).max() < 0.003 - - def test_amused_512_fp16(self): - pipe = AmusedPipeline.from_pretrained("amused/amused-512", variant="fp16", torch_dtype=torch.float16) - pipe.to(torch_device) - image = pipe("dog", generator=torch.Generator().manual_seed(0), num_inference_steps=2, output_type="np").images - image_slice = image[0, -3:, -3:, -1].flatten() - - assert image.shape == (1, 512, 512, 3) - expected_slice = np.array([0.1509, 0.1492, 0.1531, 0.1485, 0.1501, 0.1465, 0.1581, 0.1690, 0.1499]) - assert np.abs(image_slice - expected_slice).max() < 0.003 diff --git a/tests/pipelines/amused/test_amused_img2img.py b/tests/pipelines/amused/test_amused_img2img.py deleted file mode 100644 index a76d82a2f09c..000000000000 --- a/tests/pipelines/amused/test_amused_img2img.py +++ /dev/null @@ -1,215 +0,0 @@ -# coding=utf-8 -# Copyright 2025 HuggingFace Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import unittest - -import numpy as np -import torch -from transformers import CLIPTextConfig, CLIPTextModelWithProjection, CLIPTokenizer - -from diffusers import AmusedImg2ImgPipeline, AmusedScheduler, UVit2DModel, VQModel -from diffusers.utils import load_image -from diffusers.utils.testing_utils import ( - enable_full_determinism, - require_torch_accelerator, - slow, - torch_device, -) - -from ..pipeline_params import TEXT_GUIDED_IMAGE_VARIATION_BATCH_PARAMS, TEXT_GUIDED_IMAGE_VARIATION_PARAMS -from ..test_pipelines_common import PipelineTesterMixin - - -enable_full_determinism() - - -class AmusedImg2ImgPipelineFastTests(PipelineTesterMixin, unittest.TestCase): - pipeline_class = AmusedImg2ImgPipeline - params = TEXT_GUIDED_IMAGE_VARIATION_PARAMS - {"height", "width", "latents"} - batch_params = TEXT_GUIDED_IMAGE_VARIATION_BATCH_PARAMS - required_optional_params = PipelineTesterMixin.required_optional_params - {"latents"} - - def get_dummy_components(self): - torch.manual_seed(0) - transformer = UVit2DModel( - hidden_size=8, - use_bias=False, - hidden_dropout=0.0, - cond_embed_dim=8, - micro_cond_encode_dim=2, - micro_cond_embed_dim=10, - encoder_hidden_size=8, - vocab_size=32, - codebook_size=8, - in_channels=8, - block_out_channels=8, - num_res_blocks=1, - downsample=True, - upsample=True, - block_num_heads=1, - num_hidden_layers=1, - num_attention_heads=1, - attention_dropout=0.0, - intermediate_size=8, - layer_norm_eps=1e-06, - ln_elementwise_affine=True, - ) - scheduler = AmusedScheduler(mask_token_id=31) - torch.manual_seed(0) - vqvae = VQModel( - act_fn="silu", - block_out_channels=[8], - down_block_types=["DownEncoderBlock2D"], - in_channels=3, - latent_channels=8, - layers_per_block=1, - norm_num_groups=8, - num_vq_embeddings=32, - out_channels=3, - sample_size=8, - up_block_types=["UpDecoderBlock2D"], - mid_block_add_attention=False, - lookup_from_codebook=True, - ) - torch.manual_seed(0) - text_encoder_config = CLIPTextConfig( - bos_token_id=0, - eos_token_id=2, - hidden_size=8, - intermediate_size=8, - layer_norm_eps=1e-05, - num_attention_heads=1, - num_hidden_layers=1, - pad_token_id=1, - vocab_size=1000, - projection_dim=8, - ) - text_encoder = CLIPTextModelWithProjection(text_encoder_config) - tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip") - components = { - "transformer": transformer, - "scheduler": scheduler, - "vqvae": vqvae, - "text_encoder": text_encoder, - "tokenizer": tokenizer, - } - return components - - def get_dummy_inputs(self, device, seed=0): - if str(device).startswith("mps"): - generator = torch.manual_seed(seed) - else: - generator = torch.Generator(device=device).manual_seed(seed) - image = torch.full((1, 3, 4, 4), 1.0, dtype=torch.float32, device=device) - inputs = { - "prompt": "A painting of a squirrel eating a burger", - "generator": generator, - "num_inference_steps": 2, - "output_type": "np", - "image": image, - } - return inputs - - def test_inference_batch_consistent(self, batch_sizes=[2]): - self._test_inference_batch_consistent(batch_sizes=batch_sizes, batch_generator=False) - - @unittest.skip("aMUSEd does not support lists of generators") - def test_inference_batch_single_identical(self): ... 
- - -@slow -@require_torch_accelerator -class AmusedImg2ImgPipelineSlowTests(unittest.TestCase): - def test_amused_256(self): - pipe = AmusedImg2ImgPipeline.from_pretrained("amused/amused-256") - pipe.to(torch_device) - image = ( - load_image("https://huggingface.co/datasets/diffusers/docs-images/resolve/main/open_muse/mountains.jpg") - .resize((256, 256)) - .convert("RGB") - ) - image = pipe( - "winter mountains", - image, - generator=torch.Generator().manual_seed(0), - num_inference_steps=2, - output_type="np", - ).images - image_slice = image[0, -3:, -3:, -1].flatten() - assert image.shape == (1, 256, 256, 3) - expected_slice = np.array([0.9993, 1.0, 0.9996, 1.0, 0.9995, 0.9925, 0.999, 0.9954, 1.0]) - assert np.abs(image_slice - expected_slice).max() < 0.01 - - def test_amused_256_fp16(self): - pipe = AmusedImg2ImgPipeline.from_pretrained("amused/amused-256", torch_dtype=torch.float16, variant="fp16") - pipe.to(torch_device) - image = ( - load_image("https://huggingface.co/datasets/diffusers/docs-images/resolve/main/open_muse/mountains.jpg") - .resize((256, 256)) - .convert("RGB") - ) - image = pipe( - "winter mountains", - image, - generator=torch.Generator().manual_seed(0), - num_inference_steps=2, - output_type="np", - ).images - image_slice = image[0, -3:, -3:, -1].flatten() - assert image.shape == (1, 256, 256, 3) - expected_slice = np.array([0.998, 0.998, 0.994, 0.9944, 0.996, 0.9908, 1.0, 1.0, 0.9986]) - assert np.abs(image_slice - expected_slice).max() < 0.01 - - def test_amused_512(self): - pipe = AmusedImg2ImgPipeline.from_pretrained("amused/amused-512") - pipe.to(torch_device) - image = ( - load_image("https://huggingface.co/datasets/diffusers/docs-images/resolve/main/open_muse/mountains.jpg") - .resize((512, 512)) - .convert("RGB") - ) - image = pipe( - "winter mountains", - image, - generator=torch.Generator().manual_seed(0), - num_inference_steps=2, - output_type="np", - ).images - image_slice = image[0, -3:, -3:, -1].flatten() - - assert image.shape == (1, 512, 512, 3) - expected_slice = np.array([0.2809, 0.1879, 0.2027, 0.2418, 0.1852, 0.2145, 0.2484, 0.2425, 0.2317]) - assert np.abs(image_slice - expected_slice).max() < 0.1 - - def test_amused_512_fp16(self): - pipe = AmusedImg2ImgPipeline.from_pretrained("amused/amused-512", variant="fp16", torch_dtype=torch.float16) - pipe.to(torch_device) - image = ( - load_image("https://huggingface.co/datasets/diffusers/docs-images/resolve/main/open_muse/mountains.jpg") - .resize((512, 512)) - .convert("RGB") - ) - image = pipe( - "winter mountains", - image, - generator=torch.Generator().manual_seed(0), - num_inference_steps=2, - output_type="np", - ).images - image_slice = image[0, -3:, -3:, -1].flatten() - - assert image.shape == (1, 512, 512, 3) - expected_slice = np.array([0.2795, 0.1867, 0.2028, 0.2450, 0.1856, 0.2140, 0.2473, 0.2406, 0.2313]) - assert np.abs(image_slice - expected_slice).max() < 0.1 diff --git a/tests/pipelines/amused/test_amused_inpaint.py b/tests/pipelines/amused/test_amused_inpaint.py deleted file mode 100644 index 0b025b8a3f83..000000000000 --- a/tests/pipelines/amused/test_amused_inpaint.py +++ /dev/null @@ -1,281 +0,0 @@ -# coding=utf-8 -# Copyright 2025 HuggingFace Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import unittest - -import numpy as np -import torch -from transformers import CLIPTextConfig, CLIPTextModelWithProjection, CLIPTokenizer - -from diffusers import AmusedInpaintPipeline, AmusedScheduler, UVit2DModel, VQModel -from diffusers.utils import load_image -from diffusers.utils.testing_utils import ( - Expectations, - enable_full_determinism, - require_torch_accelerator, - slow, - torch_device, -) - -from ..pipeline_params import TEXT_GUIDED_IMAGE_INPAINTING_BATCH_PARAMS, TEXT_GUIDED_IMAGE_INPAINTING_PARAMS -from ..test_pipelines_common import PipelineTesterMixin - - -enable_full_determinism() - - -class AmusedInpaintPipelineFastTests(PipelineTesterMixin, unittest.TestCase): - pipeline_class = AmusedInpaintPipeline - params = TEXT_GUIDED_IMAGE_INPAINTING_PARAMS - {"width", "height"} - batch_params = TEXT_GUIDED_IMAGE_INPAINTING_BATCH_PARAMS - required_optional_params = PipelineTesterMixin.required_optional_params - {"latents"} - - def get_dummy_components(self): - torch.manual_seed(0) - transformer = UVit2DModel( - hidden_size=8, - use_bias=False, - hidden_dropout=0.0, - cond_embed_dim=8, - micro_cond_encode_dim=2, - micro_cond_embed_dim=10, - encoder_hidden_size=8, - vocab_size=32, - codebook_size=32, - in_channels=8, - block_out_channels=8, - num_res_blocks=1, - downsample=True, - upsample=True, - block_num_heads=1, - num_hidden_layers=1, - num_attention_heads=1, - attention_dropout=0.0, - intermediate_size=8, - layer_norm_eps=1e-06, - ln_elementwise_affine=True, - ) - scheduler = AmusedScheduler(mask_token_id=31) - torch.manual_seed(0) - vqvae = VQModel( - act_fn="silu", - block_out_channels=[8], - down_block_types=["DownEncoderBlock2D"], - in_channels=3, - latent_channels=8, - layers_per_block=1, - norm_num_groups=8, - num_vq_embeddings=32, - out_channels=3, - sample_size=8, - up_block_types=["UpDecoderBlock2D"], - mid_block_add_attention=False, - lookup_from_codebook=True, - ) - torch.manual_seed(0) - text_encoder_config = CLIPTextConfig( - bos_token_id=0, - eos_token_id=2, - hidden_size=8, - intermediate_size=8, - layer_norm_eps=1e-05, - num_attention_heads=1, - num_hidden_layers=1, - pad_token_id=1, - vocab_size=1000, - projection_dim=8, - ) - text_encoder = CLIPTextModelWithProjection(text_encoder_config) - tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip") - components = { - "transformer": transformer, - "scheduler": scheduler, - "vqvae": vqvae, - "text_encoder": text_encoder, - "tokenizer": tokenizer, - } - return components - - def get_dummy_inputs(self, device, seed=0): - if str(device).startswith("mps"): - generator = torch.manual_seed(seed) - else: - generator = torch.Generator(device=device).manual_seed(seed) - image = torch.full((1, 3, 4, 4), 1.0, dtype=torch.float32, device=device) - mask_image = torch.full((1, 1, 4, 4), 1.0, dtype=torch.float32, device=device) - mask_image[0, 0, 0, 0] = 0 - mask_image[0, 0, 0, 1] = 0 - inputs = { - "prompt": "A painting of a squirrel eating a burger", - "generator": generator, - "num_inference_steps": 2, - "output_type": "np", - "image": image, - "mask_image": mask_image, - } - return inputs 
- - def test_inference_batch_consistent(self, batch_sizes=[2]): - self._test_inference_batch_consistent(batch_sizes=batch_sizes, batch_generator=False) - - @unittest.skip("aMUSEd does not support lists of generators") - def test_inference_batch_single_identical(self): ... - - -@slow -@require_torch_accelerator -class AmusedInpaintPipelineSlowTests(unittest.TestCase): - def test_amused_256(self): - pipe = AmusedInpaintPipeline.from_pretrained("amused/amused-256") - pipe.to(torch_device) - image = ( - load_image("https://huggingface.co/datasets/diffusers/docs-images/resolve/main/open_muse/mountains_1.jpg") - .resize((256, 256)) - .convert("RGB") - ) - mask_image = ( - load_image( - "https://huggingface.co/datasets/diffusers/docs-images/resolve/main/open_muse/mountains_1_mask.png" - ) - .resize((256, 256)) - .convert("L") - ) - image = pipe( - "winter mountains", - image, - mask_image, - generator=torch.Generator().manual_seed(0), - num_inference_steps=2, - output_type="np", - ).images - image_slice = image[0, -3:, -3:, -1].flatten() - assert image.shape == (1, 256, 256, 3) - expected_slice = np.array([0.0699, 0.0716, 0.0608, 0.0715, 0.0797, 0.0638, 0.0802, 0.0924, 0.0634]) - assert np.abs(image_slice - expected_slice).max() < 0.1 - - def test_amused_256_fp16(self): - pipe = AmusedInpaintPipeline.from_pretrained("amused/amused-256", variant="fp16", torch_dtype=torch.float16) - pipe.to(torch_device) - image = ( - load_image("https://huggingface.co/datasets/diffusers/docs-images/resolve/main/open_muse/mountains_1.jpg") - .resize((256, 256)) - .convert("RGB") - ) - mask_image = ( - load_image( - "https://huggingface.co/datasets/diffusers/docs-images/resolve/main/open_muse/mountains_1_mask.png" - ) - .resize((256, 256)) - .convert("L") - ) - image = pipe( - "winter mountains", - image, - mask_image, - generator=torch.Generator().manual_seed(0), - num_inference_steps=2, - output_type="np", - ).images - image_slice = image[0, -3:, -3:, -1].flatten() - assert image.shape == (1, 256, 256, 3) - expected_slice = np.array([0.0735, 0.0749, 0.065, 0.0739, 0.0805, 0.0667, 0.0802, 0.0923, 0.0622]) - assert np.abs(image_slice - expected_slice).max() < 0.1 - - def test_amused_512(self): - pipe = AmusedInpaintPipeline.from_pretrained("amused/amused-512") - pipe.to(torch_device) - image = ( - load_image("https://huggingface.co/datasets/diffusers/docs-images/resolve/main/open_muse/mountains_1.jpg") - .resize((512, 512)) - .convert("RGB") - ) - mask_image = ( - load_image( - "https://huggingface.co/datasets/diffusers/docs-images/resolve/main/open_muse/mountains_1_mask.png" - ) - .resize((512, 512)) - .convert("L") - ) - image = pipe( - "winter mountains", - image, - mask_image, - generator=torch.Generator().manual_seed(0), - num_inference_steps=2, - output_type="np", - ).images - image_slice = image[0, -3:, -3:, -1].flatten() - assert image.shape == (1, 512, 512, 3) - expected_slice = np.array([0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0005, 0.0]) - assert np.abs(image_slice - expected_slice).max() < 0.05 - - def test_amused_512_fp16(self): - pipe = AmusedInpaintPipeline.from_pretrained("amused/amused-512", variant="fp16", torch_dtype=torch.float16) - pipe.to(torch_device) - image = ( - load_image("https://huggingface.co/datasets/diffusers/docs-images/resolve/main/open_muse/mountains_1.jpg") - .resize((512, 512)) - .convert("RGB") - ) - mask_image = ( - load_image( - "https://huggingface.co/datasets/diffusers/docs-images/resolve/main/open_muse/mountains_1_mask.png" - ) - .resize((512, 512)) - .convert("L") - ) - image 
= pipe( - "winter mountains", - image, - mask_image, - generator=torch.Generator().manual_seed(0), - num_inference_steps=2, - output_type="np", - ).images - image_slice = image[0, -3:, -3:, -1].flatten() - - assert image.shape == (1, 512, 512, 3) - expected_slices = Expectations( - { - ("xpu", 3): np.array( - [ - 0.0274, - 0.0211, - 0.0154, - 0.0257, - 0.0299, - 0.0170, - 0.0326, - 0.0420, - 0.0150, - ] - ), - ("cuda", 7): np.array( - [ - 0.0227, - 0.0157, - 0.0098, - 0.0213, - 0.0250, - 0.0127, - 0.0280, - 0.0380, - 0.0095, - ] - ), - } - ) - expected_slice = expected_slices.get_expectation() - assert np.abs(image_slice - expected_slice).max() < 0.003 diff --git a/tests/pipelines/audioldm/__init__.py b/tests/pipelines/audioldm/__init__.py deleted file mode 100644 index e69de29bb2d1..000000000000 diff --git a/tests/pipelines/audioldm/test_audioldm.py b/tests/pipelines/audioldm/test_audioldm.py deleted file mode 100644 index eb4139f0dc3d..000000000000 --- a/tests/pipelines/audioldm/test_audioldm.py +++ /dev/null @@ -1,461 +0,0 @@ -# coding=utf-8 -# Copyright 2025 HuggingFace Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - - -import gc -import unittest - -import numpy as np -import torch -import torch.nn.functional as F -from transformers import ( - ClapTextConfig, - ClapTextModelWithProjection, - RobertaTokenizer, - SpeechT5HifiGan, - SpeechT5HifiGanConfig, -) - -from diffusers import ( - AudioLDMPipeline, - AutoencoderKL, - DDIMScheduler, - LMSDiscreteScheduler, - PNDMScheduler, - UNet2DConditionModel, -) -from diffusers.utils import is_xformers_available -from diffusers.utils.testing_utils import backend_empty_cache, enable_full_determinism, nightly, torch_device - -from ..pipeline_params import TEXT_TO_AUDIO_BATCH_PARAMS, TEXT_TO_AUDIO_PARAMS -from ..test_pipelines_common import PipelineTesterMixin - - -enable_full_determinism() - - -class AudioLDMPipelineFastTests(PipelineTesterMixin, unittest.TestCase): - pipeline_class = AudioLDMPipeline - params = TEXT_TO_AUDIO_PARAMS - batch_params = TEXT_TO_AUDIO_BATCH_PARAMS - required_optional_params = frozenset( - [ - "num_inference_steps", - "num_waveforms_per_prompt", - "generator", - "latents", - "output_type", - "return_dict", - "callback", - "callback_steps", - ] - ) - - supports_dduf = False - - def get_dummy_components(self): - torch.manual_seed(0) - unet = UNet2DConditionModel( - block_out_channels=(8, 16), - layers_per_block=1, - norm_num_groups=8, - sample_size=32, - in_channels=4, - out_channels=4, - down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"), - up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"), - cross_attention_dim=(8, 16), - class_embed_type="simple_projection", - projection_class_embeddings_input_dim=8, - class_embeddings_concat=True, - ) - scheduler = DDIMScheduler( - beta_start=0.00085, - beta_end=0.012, - beta_schedule="scaled_linear", - clip_sample=False, - set_alpha_to_one=False, - ) - torch.manual_seed(0) - vae = AutoencoderKL( - block_out_channels=[8, 16], - in_channels=1, - out_channels=1, - 
norm_num_groups=8, - down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"], - up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"], - latent_channels=4, - ) - torch.manual_seed(0) - text_encoder_config = ClapTextConfig( - bos_token_id=0, - eos_token_id=2, - hidden_size=8, - intermediate_size=37, - layer_norm_eps=1e-05, - num_attention_heads=1, - num_hidden_layers=1, - pad_token_id=1, - vocab_size=1000, - projection_dim=8, - ) - text_encoder = ClapTextModelWithProjection(text_encoder_config) - tokenizer = RobertaTokenizer.from_pretrained("hf-internal-testing/tiny-random-roberta", model_max_length=77) - - vocoder_config = SpeechT5HifiGanConfig( - model_in_dim=8, - sampling_rate=16000, - upsample_initial_channel=16, - upsample_rates=[2, 2], - upsample_kernel_sizes=[4, 4], - resblock_kernel_sizes=[3, 7], - resblock_dilation_sizes=[[1, 3, 5], [1, 3, 5]], - normalize_before=False, - ) - - vocoder = SpeechT5HifiGan(vocoder_config) - - components = { - "unet": unet, - "scheduler": scheduler, - "vae": vae, - "text_encoder": text_encoder, - "tokenizer": tokenizer, - "vocoder": vocoder, - } - return components - - def get_dummy_inputs(self, device, seed=0): - if str(device).startswith("mps"): - generator = torch.manual_seed(seed) - else: - generator = torch.Generator(device=device).manual_seed(seed) - inputs = { - "prompt": "A hammer hitting a wooden surface", - "generator": generator, - "num_inference_steps": 2, - "guidance_scale": 6.0, - } - return inputs - - def test_audioldm_ddim(self): - device = "cpu" # ensure determinism for the device-dependent torch.Generator - - components = self.get_dummy_components() - audioldm_pipe = AudioLDMPipeline(**components) - audioldm_pipe = audioldm_pipe.to(torch_device) - audioldm_pipe.set_progress_bar_config(disable=None) - - inputs = self.get_dummy_inputs(device) - output = audioldm_pipe(**inputs) - audio = output.audios[0] - - assert audio.ndim == 1 - assert len(audio) == 256 - - audio_slice = audio[:10] - expected_slice = np.array( - [-0.0050, 0.0050, -0.0060, 0.0033, -0.0026, 0.0033, -0.0027, 0.0033, -0.0028, 0.0033] - ) - - assert np.abs(audio_slice - expected_slice).max() < 1e-2 - - def test_audioldm_prompt_embeds(self): - components = self.get_dummy_components() - audioldm_pipe = AudioLDMPipeline(**components) - audioldm_pipe = audioldm_pipe.to(torch_device) - audioldm_pipe.set_progress_bar_config(disable=None) - - inputs = self.get_dummy_inputs(torch_device) - inputs["prompt"] = 3 * [inputs["prompt"]] - - # forward - output = audioldm_pipe(**inputs) - audio_1 = output.audios[0] - - inputs = self.get_dummy_inputs(torch_device) - prompt = 3 * [inputs.pop("prompt")] - - text_inputs = audioldm_pipe.tokenizer( - prompt, - padding="max_length", - max_length=audioldm_pipe.tokenizer.model_max_length, - truncation=True, - return_tensors="pt", - ) - text_inputs = text_inputs["input_ids"].to(torch_device) - - prompt_embeds = audioldm_pipe.text_encoder( - text_inputs, - ) - prompt_embeds = prompt_embeds.text_embeds - # additional L_2 normalization over each hidden-state - prompt_embeds = F.normalize(prompt_embeds, dim=-1) - - inputs["prompt_embeds"] = prompt_embeds - - # forward - output = audioldm_pipe(**inputs) - audio_2 = output.audios[0] - - assert np.abs(audio_1 - audio_2).max() < 1e-2 - - def test_audioldm_negative_prompt_embeds(self): - components = self.get_dummy_components() - audioldm_pipe = AudioLDMPipeline(**components) - audioldm_pipe = 
audioldm_pipe.to(torch_device) - audioldm_pipe.set_progress_bar_config(disable=None) - - inputs = self.get_dummy_inputs(torch_device) - negative_prompt = 3 * ["this is a negative prompt"] - inputs["negative_prompt"] = negative_prompt - inputs["prompt"] = 3 * [inputs["prompt"]] - - # forward - output = audioldm_pipe(**inputs) - audio_1 = output.audios[0] - - inputs = self.get_dummy_inputs(torch_device) - prompt = 3 * [inputs.pop("prompt")] - - embeds = [] - for p in [prompt, negative_prompt]: - text_inputs = audioldm_pipe.tokenizer( - p, - padding="max_length", - max_length=audioldm_pipe.tokenizer.model_max_length, - truncation=True, - return_tensors="pt", - ) - text_inputs = text_inputs["input_ids"].to(torch_device) - - text_embeds = audioldm_pipe.text_encoder( - text_inputs, - ) - text_embeds = text_embeds.text_embeds - # additional L_2 normalization over each hidden-state - text_embeds = F.normalize(text_embeds, dim=-1) - - embeds.append(text_embeds) - - inputs["prompt_embeds"], inputs["negative_prompt_embeds"] = embeds - - # forward - output = audioldm_pipe(**inputs) - audio_2 = output.audios[0] - - assert np.abs(audio_1 - audio_2).max() < 1e-2 - - def test_audioldm_negative_prompt(self): - device = "cpu" # ensure determinism for the device-dependent torch.Generator - components = self.get_dummy_components() - components["scheduler"] = PNDMScheduler(skip_prk_steps=True) - audioldm_pipe = AudioLDMPipeline(**components) - audioldm_pipe = audioldm_pipe.to(device) - audioldm_pipe.set_progress_bar_config(disable=None) - - inputs = self.get_dummy_inputs(device) - negative_prompt = "egg cracking" - output = audioldm_pipe(**inputs, negative_prompt=negative_prompt) - audio = output.audios[0] - - assert audio.ndim == 1 - assert len(audio) == 256 - - audio_slice = audio[:10] - expected_slice = np.array( - [-0.0051, 0.0050, -0.0060, 0.0034, -0.0026, 0.0033, -0.0027, 0.0033, -0.0028, 0.0032] - ) - - assert np.abs(audio_slice - expected_slice).max() < 1e-2 - - def test_audioldm_num_waveforms_per_prompt(self): - device = "cpu" # ensure determinism for the device-dependent torch.Generator - components = self.get_dummy_components() - components["scheduler"] = PNDMScheduler(skip_prk_steps=True) - audioldm_pipe = AudioLDMPipeline(**components) - audioldm_pipe = audioldm_pipe.to(device) - audioldm_pipe.set_progress_bar_config(disable=None) - - prompt = "A hammer hitting a wooden surface" - - # test num_waveforms_per_prompt=1 (default) - audios = audioldm_pipe(prompt, num_inference_steps=2).audios - - assert audios.shape == (1, 256) - - # test num_waveforms_per_prompt=1 (default) for batch of prompts - batch_size = 2 - audios = audioldm_pipe([prompt] * batch_size, num_inference_steps=2).audios - - assert audios.shape == (batch_size, 256) - - # test num_waveforms_per_prompt for single prompt - num_waveforms_per_prompt = 2 - audios = audioldm_pipe(prompt, num_inference_steps=2, num_waveforms_per_prompt=num_waveforms_per_prompt).audios - - assert audios.shape == (num_waveforms_per_prompt, 256) - - # test num_waveforms_per_prompt for batch of prompts - batch_size = 2 - audios = audioldm_pipe( - [prompt] * batch_size, num_inference_steps=2, num_waveforms_per_prompt=num_waveforms_per_prompt - ).audios - - assert audios.shape == (batch_size * num_waveforms_per_prompt, 256) - - def test_audioldm_audio_length_in_s(self): - device = "cpu" # ensure determinism for the device-dependent torch.Generator - components = self.get_dummy_components() - audioldm_pipe = AudioLDMPipeline(**components) - audioldm_pipe = 
audioldm_pipe.to(torch_device) - audioldm_pipe.set_progress_bar_config(disable=None) - vocoder_sampling_rate = audioldm_pipe.vocoder.config.sampling_rate - - inputs = self.get_dummy_inputs(device) - output = audioldm_pipe(audio_length_in_s=0.016, **inputs) - audio = output.audios[0] - - assert audio.ndim == 1 - assert len(audio) / vocoder_sampling_rate == 0.016 - - output = audioldm_pipe(audio_length_in_s=0.032, **inputs) - audio = output.audios[0] - - assert audio.ndim == 1 - assert len(audio) / vocoder_sampling_rate == 0.032 - - def test_audioldm_vocoder_model_in_dim(self): - components = self.get_dummy_components() - audioldm_pipe = AudioLDMPipeline(**components) - audioldm_pipe = audioldm_pipe.to(torch_device) - audioldm_pipe.set_progress_bar_config(disable=None) - - prompt = ["hey"] - - output = audioldm_pipe(prompt, num_inference_steps=1) - audio_shape = output.audios.shape - assert audio_shape == (1, 256) - - config = audioldm_pipe.vocoder.config - config.model_in_dim *= 2 - audioldm_pipe.vocoder = SpeechT5HifiGan(config).to(torch_device) - output = audioldm_pipe(prompt, num_inference_steps=1) - audio_shape = output.audios.shape - # waveform shape is unchanged, we just have 2x the number of mel channels in the spectrogram - assert audio_shape == (1, 256) - - def test_attention_slicing_forward_pass(self): - self._test_attention_slicing_forward_pass(test_mean_pixel_difference=False) - - def test_inference_batch_single_identical(self): - self._test_inference_batch_single_identical() - - @unittest.skipIf( - torch_device != "cuda" or not is_xformers_available(), - reason="XFormers attention is only available with CUDA and `xformers` installed", - ) - def test_xformers_attention_forwardGenerator_pass(self): - self._test_xformers_attention_forwardGenerator_pass(test_mean_pixel_difference=False) - - -@nightly -class AudioLDMPipelineSlowTests(unittest.TestCase): - def setUp(self): - super().setUp() - gc.collect() - backend_empty_cache(torch_device) - - def tearDown(self): - super().tearDown() - gc.collect() - backend_empty_cache(torch_device) - - def get_inputs(self, device, generator_device="cpu", dtype=torch.float32, seed=0): - generator = torch.Generator(device=generator_device).manual_seed(seed) - latents = np.random.RandomState(seed).standard_normal((1, 8, 128, 16)) - latents = torch.from_numpy(latents).to(device=device, dtype=dtype) - inputs = { - "prompt": "A hammer hitting a wooden surface", - "latents": latents, - "generator": generator, - "num_inference_steps": 3, - "guidance_scale": 2.5, - } - return inputs - - def test_audioldm(self): - audioldm_pipe = AudioLDMPipeline.from_pretrained("cvssp/audioldm") - audioldm_pipe = audioldm_pipe.to(torch_device) - audioldm_pipe.set_progress_bar_config(disable=None) - - inputs = self.get_inputs(torch_device) - inputs["num_inference_steps"] = 25 - audio = audioldm_pipe(**inputs).audios[0] - - assert audio.ndim == 1 - assert len(audio) == 81920 - - audio_slice = audio[77230:77240] - expected_slice = np.array( - [-0.4884, -0.4607, 0.0023, 0.5007, 0.5896, 0.5151, 0.3813, -0.0208, -0.3687, -0.4315] - ) - max_diff = np.abs(expected_slice - audio_slice).max() - assert max_diff < 1e-2 - - -@nightly -class AudioLDMPipelineNightlyTests(unittest.TestCase): - def setUp(self): - super().setUp() - gc.collect() - backend_empty_cache(torch_device) - - def tearDown(self): - super().tearDown() - gc.collect() - backend_empty_cache(torch_device) - - def get_inputs(self, device, generator_device="cpu", dtype=torch.float32, seed=0): - generator = 
torch.Generator(device=generator_device).manual_seed(seed) - latents = np.random.RandomState(seed).standard_normal((1, 8, 128, 16)) - latents = torch.from_numpy(latents).to(device=device, dtype=dtype) - inputs = { - "prompt": "A hammer hitting a wooden surface", - "latents": latents, - "generator": generator, - "num_inference_steps": 3, - "guidance_scale": 2.5, - } - return inputs - - def test_audioldm_lms(self): - audioldm_pipe = AudioLDMPipeline.from_pretrained("cvssp/audioldm") - audioldm_pipe.scheduler = LMSDiscreteScheduler.from_config(audioldm_pipe.scheduler.config) - audioldm_pipe = audioldm_pipe.to(torch_device) - audioldm_pipe.set_progress_bar_config(disable=None) - - inputs = self.get_inputs(torch_device) - audio = audioldm_pipe(**inputs).audios[0] - - assert audio.ndim == 1 - assert len(audio) == 81920 - - audio_slice = audio[27780:27790] - expected_slice = np.array([-0.2131, -0.0873, -0.0124, -0.0189, 0.0569, 0.1373, 0.1883, 0.2886, 0.3297, 0.2212]) - max_diff = np.abs(expected_slice - audio_slice).max() - assert max_diff < 3e-2 diff --git a/tests/pipelines/blipdiffusion/__init__.py b/tests/pipelines/blipdiffusion/__init__.py deleted file mode 100644 index e69de29bb2d1..000000000000 diff --git a/tests/pipelines/blipdiffusion/test_blipdiffusion.py b/tests/pipelines/blipdiffusion/test_blipdiffusion.py deleted file mode 100644 index 0e3f723fc6e7..000000000000 --- a/tests/pipelines/blipdiffusion/test_blipdiffusion.py +++ /dev/null @@ -1,204 +0,0 @@ -# coding=utf-8 -# Copyright 2025 HuggingFace Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-import unittest - -import numpy as np -import torch -from PIL import Image -from transformers import CLIPTokenizer -from transformers.models.blip_2.configuration_blip_2 import Blip2Config -from transformers.models.clip.configuration_clip import CLIPTextConfig - -from diffusers import AutoencoderKL, BlipDiffusionPipeline, PNDMScheduler, UNet2DConditionModel -from diffusers.utils.testing_utils import enable_full_determinism -from src.diffusers.pipelines.blip_diffusion.blip_image_processing import BlipImageProcessor -from src.diffusers.pipelines.blip_diffusion.modeling_blip2 import Blip2QFormerModel -from src.diffusers.pipelines.blip_diffusion.modeling_ctx_clip import ContextCLIPTextModel - -from ..test_pipelines_common import PipelineTesterMixin - - -enable_full_determinism() - - -class BlipDiffusionPipelineFastTests(PipelineTesterMixin, unittest.TestCase): - pipeline_class = BlipDiffusionPipeline - params = [ - "prompt", - "reference_image", - "source_subject_category", - "target_subject_category", - ] - batch_params = [ - "prompt", - "reference_image", - "source_subject_category", - "target_subject_category", - ] - required_optional_params = [ - "generator", - "height", - "width", - "latents", - "guidance_scale", - "num_inference_steps", - "neg_prompt", - "prompt_strength", - "prompt_reps", - ] - - supports_dduf = False - - def get_dummy_components(self): - torch.manual_seed(0) - text_encoder_config = CLIPTextConfig( - vocab_size=1000, - hidden_size=8, - intermediate_size=8, - projection_dim=8, - num_hidden_layers=1, - num_attention_heads=1, - max_position_embeddings=77, - ) - text_encoder = ContextCLIPTextModel(text_encoder_config) - - vae = AutoencoderKL( - in_channels=4, - out_channels=4, - down_block_types=("DownEncoderBlock2D",), - up_block_types=("UpDecoderBlock2D",), - block_out_channels=(8,), - norm_num_groups=8, - layers_per_block=1, - act_fn="silu", - latent_channels=4, - sample_size=8, - ) - - blip_vision_config = { - "hidden_size": 8, - "intermediate_size": 8, - "num_hidden_layers": 1, - "num_attention_heads": 1, - "image_size": 224, - "patch_size": 14, - "hidden_act": "quick_gelu", - } - - blip_qformer_config = { - "vocab_size": 1000, - "hidden_size": 8, - "num_hidden_layers": 1, - "num_attention_heads": 1, - "intermediate_size": 8, - "max_position_embeddings": 512, - "cross_attention_frequency": 1, - "encoder_hidden_size": 8, - } - qformer_config = Blip2Config( - vision_config=blip_vision_config, - qformer_config=blip_qformer_config, - num_query_tokens=8, - tokenizer="hf-internal-testing/tiny-random-bert", - ) - qformer = Blip2QFormerModel(qformer_config) - - unet = UNet2DConditionModel( - block_out_channels=(8, 16), - norm_num_groups=8, - layers_per_block=1, - sample_size=16, - in_channels=4, - out_channels=4, - down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"), - up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"), - cross_attention_dim=8, - ) - tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip") - - scheduler = PNDMScheduler( - beta_start=0.00085, - beta_end=0.012, - beta_schedule="scaled_linear", - set_alpha_to_one=False, - skip_prk_steps=True, - ) - - vae.eval() - qformer.eval() - text_encoder.eval() - - image_processor = BlipImageProcessor() - - components = { - "text_encoder": text_encoder, - "vae": vae, - "qformer": qformer, - "unet": unet, - "tokenizer": tokenizer, - "scheduler": scheduler, - "image_processor": image_processor, - } - return components - - def get_dummy_inputs(self, device, seed=0): - 
np.random.seed(seed) - reference_image = np.random.rand(32, 32, 3) * 255 - reference_image = Image.fromarray(reference_image.astype("uint8")).convert("RGBA") - - if str(device).startswith("mps"): - generator = torch.manual_seed(seed) - else: - generator = torch.Generator(device=device).manual_seed(seed) - inputs = { - "prompt": "swimming underwater", - "generator": generator, - "reference_image": reference_image, - "source_subject_category": "dog", - "target_subject_category": "dog", - "height": 32, - "width": 32, - "guidance_scale": 7.5, - "num_inference_steps": 2, - "output_type": "np", - } - return inputs - - def test_blipdiffusion(self): - device = "cpu" - components = self.get_dummy_components() - - pipe = self.pipeline_class(**components) - pipe = pipe.to(device) - - pipe.set_progress_bar_config(disable=None) - - image = pipe(**self.get_dummy_inputs(device))[0] - image_slice = image[0, -3:, -3:, 0] - - assert image.shape == (1, 16, 16, 4) - - expected_slice = np.array( - [0.5329548, 0.8372512, 0.33269387, 0.82096875, 0.43657133, 0.3783, 0.5953028, 0.51934963, 0.42142007] - ) - - assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2, ( - f"expected_slice {expected_slice}, but got {image_slice.flatten()}" - ) - - @unittest.skip("Test not supported because of complexities in deriving query_embeds.") - def test_encode_prompt_works_in_isolation(self): - pass diff --git a/tests/pipelines/controlnet/test_controlnet_blip_diffusion.py b/tests/pipelines/controlnet/test_controlnet_blip_diffusion.py deleted file mode 100644 index 100082b6f07d..000000000000 --- a/tests/pipelines/controlnet/test_controlnet_blip_diffusion.py +++ /dev/null @@ -1,228 +0,0 @@ -# coding=utf-8 -# Copyright 2025 HuggingFace Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-import unittest - -import numpy as np -import torch -from PIL import Image -from transformers import CLIPTokenizer -from transformers.models.blip_2.configuration_blip_2 import Blip2Config -from transformers.models.clip.configuration_clip import CLIPTextConfig - -from diffusers import ( - AutoencoderKL, - BlipDiffusionControlNetPipeline, - ControlNetModel, - PNDMScheduler, - UNet2DConditionModel, -) -from diffusers.utils.testing_utils import enable_full_determinism, torch_device -from src.diffusers.pipelines.blip_diffusion.blip_image_processing import BlipImageProcessor -from src.diffusers.pipelines.blip_diffusion.modeling_blip2 import Blip2QFormerModel -from src.diffusers.pipelines.blip_diffusion.modeling_ctx_clip import ContextCLIPTextModel - -from ..test_pipelines_common import PipelineTesterMixin - - -enable_full_determinism() - - -class BlipDiffusionControlNetPipelineFastTests(PipelineTesterMixin, unittest.TestCase): - pipeline_class = BlipDiffusionControlNetPipeline - params = [ - "prompt", - "reference_image", - "source_subject_category", - "target_subject_category", - "condtioning_image", - ] - batch_params = [ - "prompt", - "reference_image", - "source_subject_category", - "target_subject_category", - "condtioning_image", - ] - required_optional_params = [ - "generator", - "height", - "width", - "latents", - "guidance_scale", - "num_inference_steps", - "neg_prompt", - "prompt_strength", - "prompt_reps", - ] - - supports_dduf = False - - def get_dummy_components(self): - torch.manual_seed(0) - text_encoder_config = CLIPTextConfig( - vocab_size=1000, - hidden_size=16, - intermediate_size=16, - projection_dim=16, - num_hidden_layers=1, - num_attention_heads=1, - max_position_embeddings=77, - ) - text_encoder = ContextCLIPTextModel(text_encoder_config) - - vae = AutoencoderKL( - in_channels=4, - out_channels=4, - down_block_types=("DownEncoderBlock2D",), - up_block_types=("UpDecoderBlock2D",), - block_out_channels=(32,), - layers_per_block=1, - act_fn="silu", - latent_channels=4, - norm_num_groups=16, - sample_size=16, - ) - - blip_vision_config = { - "hidden_size": 16, - "intermediate_size": 16, - "num_hidden_layers": 1, - "num_attention_heads": 1, - "image_size": 224, - "patch_size": 14, - "hidden_act": "quick_gelu", - } - - blip_qformer_config = { - "vocab_size": 1000, - "hidden_size": 16, - "num_hidden_layers": 1, - "num_attention_heads": 1, - "intermediate_size": 16, - "max_position_embeddings": 512, - "cross_attention_frequency": 1, - "encoder_hidden_size": 16, - } - qformer_config = Blip2Config( - vision_config=blip_vision_config, - qformer_config=blip_qformer_config, - num_query_tokens=16, - tokenizer="hf-internal-testing/tiny-random-bert", - ) - qformer = Blip2QFormerModel(qformer_config) - - unet = UNet2DConditionModel( - block_out_channels=(4, 16), - layers_per_block=1, - norm_num_groups=4, - sample_size=16, - in_channels=4, - out_channels=4, - down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"), - up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"), - cross_attention_dim=16, - ) - tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip") - - scheduler = PNDMScheduler( - beta_start=0.00085, - beta_end=0.012, - beta_schedule="scaled_linear", - set_alpha_to_one=False, - skip_prk_steps=True, - ) - controlnet = ControlNetModel( - block_out_channels=(4, 16), - layers_per_block=1, - in_channels=4, - norm_num_groups=4, - down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"), - cross_attention_dim=16, - 
conditioning_embedding_out_channels=(8, 16), - ) - - vae.eval() - qformer.eval() - text_encoder.eval() - - image_processor = BlipImageProcessor() - - components = { - "text_encoder": text_encoder, - "vae": vae, - "qformer": qformer, - "unet": unet, - "tokenizer": tokenizer, - "scheduler": scheduler, - "controlnet": controlnet, - "image_processor": image_processor, - } - return components - - def get_dummy_inputs(self, device, seed=0): - np.random.seed(seed) - reference_image = np.random.rand(32, 32, 3) * 255 - reference_image = Image.fromarray(reference_image.astype("uint8")).convert("RGBA") - cond_image = np.random.rand(32, 32, 3) * 255 - cond_image = Image.fromarray(cond_image.astype("uint8")).convert("RGBA") - - if str(device).startswith("mps"): - generator = torch.manual_seed(seed) - else: - generator = torch.Generator(device=device).manual_seed(seed) - inputs = { - "prompt": "swimming underwater", - "generator": generator, - "reference_image": reference_image, - "condtioning_image": cond_image, - "source_subject_category": "dog", - "target_subject_category": "dog", - "height": 32, - "width": 32, - "guidance_scale": 7.5, - "num_inference_steps": 2, - "output_type": "np", - } - return inputs - - def test_dict_tuple_outputs_equivalent(self): - expected_slice = None - if torch_device == "cpu": - expected_slice = np.array([0.4803, 0.3865, 0.1422, 0.6119, 0.2283, 0.6365, 0.5453, 0.5205, 0.3581]) - super().test_dict_tuple_outputs_equivalent(expected_slice=expected_slice) - - def test_blipdiffusion_controlnet(self): - device = "cpu" - components = self.get_dummy_components() - - pipe = self.pipeline_class(**components) - pipe = pipe.to(device) - - pipe.set_progress_bar_config(disable=None) - - image = pipe(**self.get_dummy_inputs(device))[0] - image_slice = image[0, -3:, -3:, 0] - - assert image.shape == (1, 16, 16, 4) - expected_slice = np.array([0.7953, 0.7136, 0.6597, 0.4779, 0.7389, 0.4111, 0.5826, 0.4150, 0.8422]) - - assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2, ( - f" expected_slice {expected_slice}, but got {image_slice.flatten()}" - ) - - @unittest.skip("Test not supported because of complexities in deriving query_embeds.") - def test_encode_prompt_works_in_isolation(self): - pass diff --git a/tests/pipelines/controlnet_xs/__init__.py b/tests/pipelines/controlnet_xs/__init__.py deleted file mode 100644 index e69de29bb2d1..000000000000 diff --git a/tests/pipelines/controlnet_xs/test_controlnetxs.py b/tests/pipelines/controlnet_xs/test_controlnetxs.py deleted file mode 100644 index 6f8422797cce..000000000000 --- a/tests/pipelines/controlnet_xs/test_controlnetxs.py +++ /dev/null @@ -1,352 +0,0 @@ -# coding=utf-8 -# Copyright 2023 HuggingFace Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import gc -import unittest - -import numpy as np -import torch -from transformers import CLIPTextConfig, CLIPTextModel, CLIPTokenizer - -from diffusers import ( - AsymmetricAutoencoderKL, - AutoencoderKL, - AutoencoderTiny, - ConsistencyDecoderVAE, - ControlNetXSAdapter, - DDIMScheduler, - LCMScheduler, - StableDiffusionControlNetXSPipeline, - UNet2DConditionModel, -) -from diffusers.utils.import_utils import is_xformers_available -from diffusers.utils.testing_utils import ( - backend_empty_cache, - enable_full_determinism, - load_image, - require_accelerator, - require_torch_accelerator, - slow, - torch_device, -) -from diffusers.utils.torch_utils import randn_tensor - -from ...models.autoencoders.vae import ( - get_asym_autoencoder_kl_config, - get_autoencoder_kl_config, - get_autoencoder_tiny_config, - get_consistency_vae_config, -) -from ..pipeline_params import ( - IMAGE_TO_IMAGE_IMAGE_PARAMS, - TEXT_TO_IMAGE_BATCH_PARAMS, - TEXT_TO_IMAGE_IMAGE_PARAMS, - TEXT_TO_IMAGE_PARAMS, -) -from ..test_pipelines_common import ( - PipelineKarrasSchedulerTesterMixin, - PipelineLatentTesterMixin, - PipelineTesterMixin, - SDFunctionTesterMixin, -) - - -enable_full_determinism() - - -def to_np(tensor): - if isinstance(tensor, torch.Tensor): - tensor = tensor.detach().cpu().numpy() - - return tensor - - -class ControlNetXSPipelineFastTests( - PipelineLatentTesterMixin, - PipelineKarrasSchedulerTesterMixin, - PipelineTesterMixin, - SDFunctionTesterMixin, - unittest.TestCase, -): - pipeline_class = StableDiffusionControlNetXSPipeline - params = TEXT_TO_IMAGE_PARAMS - batch_params = TEXT_TO_IMAGE_BATCH_PARAMS - image_params = IMAGE_TO_IMAGE_IMAGE_PARAMS - image_latents_params = TEXT_TO_IMAGE_IMAGE_PARAMS - - test_attention_slicing = False - test_layerwise_casting = True - test_group_offloading = True - - def get_dummy_components(self, time_cond_proj_dim=None): - torch.manual_seed(0) - unet = UNet2DConditionModel( - block_out_channels=(4, 8), - layers_per_block=2, - sample_size=16, - in_channels=4, - out_channels=4, - down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"), - up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"), - cross_attention_dim=8, - norm_num_groups=4, - time_cond_proj_dim=time_cond_proj_dim, - use_linear_projection=True, - ) - torch.manual_seed(0) - controlnet = ControlNetXSAdapter.from_unet( - unet=unet, - size_ratio=1, - learn_time_embedding=True, - conditioning_embedding_out_channels=(2, 2), - ) - torch.manual_seed(0) - scheduler = DDIMScheduler( - beta_start=0.00085, - beta_end=0.012, - beta_schedule="scaled_linear", - clip_sample=False, - set_alpha_to_one=False, - ) - torch.manual_seed(0) - vae = AutoencoderKL( - block_out_channels=[4, 8], - in_channels=3, - out_channels=3, - down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"], - up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"], - latent_channels=4, - norm_num_groups=2, - ) - torch.manual_seed(0) - text_encoder_config = CLIPTextConfig( - bos_token_id=0, - eos_token_id=2, - hidden_size=8, - intermediate_size=37, - layer_norm_eps=1e-05, - num_attention_heads=4, - num_hidden_layers=5, - pad_token_id=1, - vocab_size=1000, - ) - text_encoder = CLIPTextModel(text_encoder_config) - tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip") - - components = { - "unet": unet, - "controlnet": controlnet, - "scheduler": scheduler, - "vae": vae, - "text_encoder": text_encoder, - "tokenizer": tokenizer, - "safety_checker": None, - "feature_extractor": None, - } - return components - - def 
get_dummy_inputs(self, device, seed=0): - if str(device).startswith("mps"): - generator = torch.manual_seed(seed) - else: - generator = torch.Generator(device=device).manual_seed(seed) - - controlnet_embedder_scale_factor = 2 - image = randn_tensor( - (1, 3, 8 * controlnet_embedder_scale_factor, 8 * controlnet_embedder_scale_factor), - generator=generator, - device=torch.device(device), - ) - - inputs = { - "prompt": "A painting of a squirrel eating a burger", - "generator": generator, - "num_inference_steps": 2, - "guidance_scale": 6.0, - "output_type": "numpy", - "image": image, - } - - return inputs - - @unittest.skipIf( - torch_device != "cuda" or not is_xformers_available(), - reason="XFormers attention is only available with CUDA and `xformers` installed", - ) - def test_xformers_attention_forwardGenerator_pass(self): - self._test_xformers_attention_forwardGenerator_pass(expected_max_diff=2e-3) - - def test_inference_batch_single_identical(self): - self._test_inference_batch_single_identical(expected_max_diff=2e-3) - - def test_controlnet_lcm(self): - device = "cpu" # ensure determinism for the device-dependent torch.Generator - - components = self.get_dummy_components(time_cond_proj_dim=8) - sd_pipe = StableDiffusionControlNetXSPipeline(**components) - sd_pipe.scheduler = LCMScheduler.from_config(sd_pipe.scheduler.config) - sd_pipe = sd_pipe.to(torch_device) - sd_pipe.set_progress_bar_config(disable=None) - - inputs = self.get_dummy_inputs(device) - output = sd_pipe(**inputs) - image = output.images - - image_slice = image[0, -3:, -3:, -1] - - assert image.shape == (1, 16, 16, 3) - expected_slice = np.array([0.745, 0.753, 0.767, 0.543, 0.523, 0.502, 0.314, 0.521, 0.478]) - - assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2 - - def test_to_dtype(self): - components = self.get_dummy_components() - pipe = self.pipeline_class(**components) - pipe.set_progress_bar_config(disable=None) - - # pipeline creates a new UNetControlNetXSModel under the hood. So we need to check the dtype from pipe.components - model_dtypes = [component.dtype for component in pipe.components.values() if hasattr(component, "dtype")] - self.assertTrue(all(dtype == torch.float32 for dtype in model_dtypes)) - - pipe.to(dtype=torch.float16) - model_dtypes = [component.dtype for component in pipe.components.values() if hasattr(component, "dtype")] - self.assertTrue(all(dtype == torch.float16 for dtype in model_dtypes)) - - def test_multi_vae(self): - components = self.get_dummy_components() - pipe = self.pipeline_class(**components) - pipe = pipe.to(torch_device) - pipe.set_progress_bar_config(disable=None) - - block_out_channels = pipe.vae.config.block_out_channels - norm_num_groups = pipe.vae.config.norm_num_groups - - vae_classes = [AutoencoderKL, AsymmetricAutoencoderKL, ConsistencyDecoderVAE, AutoencoderTiny] - configs = [ - get_autoencoder_kl_config(block_out_channels, norm_num_groups), - get_asym_autoencoder_kl_config(block_out_channels, norm_num_groups), - get_consistency_vae_config(block_out_channels, norm_num_groups), - get_autoencoder_tiny_config(block_out_channels), - ] - - out_np = pipe(**self.get_dummy_inputs_by_type(torch_device, input_image_type="np"))[0] - - for vae_cls, config in zip(vae_classes, configs): - vae = vae_cls(**config) - vae = vae.to(torch_device) - components["vae"] = vae - vae_pipe = self.pipeline_class(**components) - - # pipeline creates a new UNetControlNetXSModel under the hood, which aren't on device. - # So we need to move the new pipe to device. 
- vae_pipe.to(torch_device) - vae_pipe.set_progress_bar_config(disable=None) - - out_vae_np = vae_pipe(**self.get_dummy_inputs_by_type(torch_device, input_image_type="np"))[0] - - assert out_vae_np.shape == out_np.shape - - @require_accelerator - def test_to_device(self): - components = self.get_dummy_components() - pipe = self.pipeline_class(**components) - pipe.set_progress_bar_config(disable=None) - - pipe.to("cpu") - # pipeline creates a new UNetControlNetXSModel under the hood. So we need to check the device from pipe.components - model_devices = [ - component.device.type for component in pipe.components.values() if hasattr(component, "device") - ] - self.assertTrue(all(device == "cpu" for device in model_devices)) - - output_cpu = pipe(**self.get_dummy_inputs("cpu"))[0] - self.assertTrue(np.isnan(output_cpu).sum() == 0) - - pipe.to(torch_device) - model_devices = [ - component.device.type for component in pipe.components.values() if hasattr(component, "device") - ] - self.assertTrue(all(device == torch_device for device in model_devices)) - - output_device = pipe(**self.get_dummy_inputs(torch_device))[0] - self.assertTrue(np.isnan(to_np(output_device)).sum() == 0) - - def test_encode_prompt_works_in_isolation(self): - extra_required_param_value_dict = { - "device": torch.device(torch_device).type, - "do_classifier_free_guidance": self.get_dummy_inputs(device=torch_device).get("guidance_scale", 1.0) > 1.0, - } - return super().test_encode_prompt_works_in_isolation(extra_required_param_value_dict) - - -@slow -@require_torch_accelerator -class ControlNetXSPipelineSlowTests(unittest.TestCase): - def tearDown(self): - super().tearDown() - gc.collect() - backend_empty_cache(torch_device) - - def test_canny(self): - controlnet = ControlNetXSAdapter.from_pretrained( - "UmerHA/Testing-ConrolNetXS-SD2.1-canny", torch_dtype=torch.float16 - ) - pipe = StableDiffusionControlNetXSPipeline.from_pretrained( - "stabilityai/stable-diffusion-2-1-base", controlnet=controlnet, torch_dtype=torch.float16 - ) - pipe.enable_model_cpu_offload(device=torch_device) - pipe.set_progress_bar_config(disable=None) - - generator = torch.Generator(device="cpu").manual_seed(0) - prompt = "bird" - image = load_image( - "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/sd_controlnet/bird_canny.png" - ) - - output = pipe(prompt, image, generator=generator, output_type="np", num_inference_steps=3) - - image = output.images[0] - - assert image.shape == (768, 512, 3) - - original_image = image[-3:, -3:, -1].flatten() - expected_image = np.array([0.1963, 0.229, 0.2659, 0.2109, 0.2332, 0.2827, 0.2534, 0.2422, 0.2808]) - assert np.allclose(original_image, expected_image, atol=1e-04) - - def test_depth(self): - controlnet = ControlNetXSAdapter.from_pretrained( - "UmerHA/Testing-ConrolNetXS-SD2.1-depth", torch_dtype=torch.float16 - ) - pipe = StableDiffusionControlNetXSPipeline.from_pretrained( - "stabilityai/stable-diffusion-2-1-base", controlnet=controlnet, torch_dtype=torch.float16 - ) - pipe.enable_model_cpu_offload(device=torch_device) - pipe.set_progress_bar_config(disable=None) - - generator = torch.Generator(device="cpu").manual_seed(0) - prompt = "Stormtrooper's lecture" - image = load_image( - "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/sd_controlnet/stormtrooper_depth.png" - ) - - output = pipe(prompt, image, generator=generator, output_type="np", num_inference_steps=3) - - image = output.images[0] - - assert image.shape == (512, 512, 3) - - 
original_image = image[-3:, -3:, -1].flatten() - expected_image = np.array([0.4844, 0.4937, 0.4956, 0.4663, 0.5039, 0.5044, 0.4565, 0.4883, 0.4941]) - assert np.allclose(original_image, expected_image, atol=1e-04) diff --git a/tests/pipelines/controlnet_xs/test_controlnetxs_sdxl.py b/tests/pipelines/controlnet_xs/test_controlnetxs_sdxl.py deleted file mode 100644 index 24a8b9cd5739..000000000000 --- a/tests/pipelines/controlnet_xs/test_controlnetxs_sdxl.py +++ /dev/null @@ -1,393 +0,0 @@ -# coding=utf-8 -# Copyright 2023 HuggingFace Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import gc -import unittest - -import numpy as np -import torch -from transformers import CLIPTextConfig, CLIPTextModel, CLIPTextModelWithProjection, CLIPTokenizer - -from diffusers import ( - AsymmetricAutoencoderKL, - AutoencoderKL, - AutoencoderTiny, - ConsistencyDecoderVAE, - ControlNetXSAdapter, - EulerDiscreteScheduler, - StableDiffusionXLControlNetXSPipeline, - UNet2DConditionModel, -) -from diffusers.utils.import_utils import is_xformers_available -from diffusers.utils.testing_utils import ( - backend_empty_cache, - enable_full_determinism, - load_image, - require_torch_accelerator, - slow, - torch_device, -) -from diffusers.utils.torch_utils import randn_tensor - -from ...models.autoencoders.vae import ( - get_asym_autoencoder_kl_config, - get_autoencoder_kl_config, - get_autoencoder_tiny_config, - get_consistency_vae_config, -) -from ..pipeline_params import ( - IMAGE_TO_IMAGE_IMAGE_PARAMS, - TEXT_TO_IMAGE_BATCH_PARAMS, - TEXT_TO_IMAGE_IMAGE_PARAMS, - TEXT_TO_IMAGE_PARAMS, -) -from ..test_pipelines_common import ( - PipelineKarrasSchedulerTesterMixin, - PipelineLatentTesterMixin, - PipelineTesterMixin, -) - - -enable_full_determinism() - - -class StableDiffusionXLControlNetXSPipelineFastTests( - PipelineLatentTesterMixin, - PipelineKarrasSchedulerTesterMixin, - PipelineTesterMixin, - unittest.TestCase, -): - pipeline_class = StableDiffusionXLControlNetXSPipeline - params = TEXT_TO_IMAGE_PARAMS - batch_params = TEXT_TO_IMAGE_BATCH_PARAMS - image_params = IMAGE_TO_IMAGE_IMAGE_PARAMS - image_latents_params = TEXT_TO_IMAGE_IMAGE_PARAMS - - test_attention_slicing = False - test_layerwise_casting = True - test_group_offloading = True - - def get_dummy_components(self): - torch.manual_seed(0) - unet = UNet2DConditionModel( - block_out_channels=(4, 8), - layers_per_block=2, - sample_size=16, - in_channels=4, - out_channels=4, - down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"), - up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"), - use_linear_projection=True, - norm_num_groups=4, - # SD2-specific config below - attention_head_dim=(2, 4), - addition_embed_type="text_time", - addition_time_embed_dim=8, - transformer_layers_per_block=(1, 2), - projection_class_embeddings_input_dim=56, # 6 * 8 (addition_time_embed_dim) + 8 (cross_attention_dim) - cross_attention_dim=8, - ) - torch.manual_seed(0) - controlnet = ControlNetXSAdapter.from_unet( - unet=unet, - size_ratio=0.5, - learn_time_embedding=True, - 
conditioning_embedding_out_channels=(2, 2), - ) - torch.manual_seed(0) - scheduler = EulerDiscreteScheduler( - beta_start=0.00085, - beta_end=0.012, - steps_offset=1, - beta_schedule="scaled_linear", - timestep_spacing="leading", - ) - torch.manual_seed(0) - vae = AutoencoderKL( - block_out_channels=[4, 8], - in_channels=3, - out_channels=3, - down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"], - up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"], - latent_channels=4, - norm_num_groups=2, - ) - torch.manual_seed(0) - text_encoder_config = CLIPTextConfig( - bos_token_id=0, - eos_token_id=2, - hidden_size=4, - intermediate_size=37, - layer_norm_eps=1e-05, - num_attention_heads=4, - num_hidden_layers=5, - pad_token_id=1, - vocab_size=1000, - # SD2-specific config below - hidden_act="gelu", - projection_dim=8, - ) - text_encoder = CLIPTextModel(text_encoder_config) - tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip") - - text_encoder_2 = CLIPTextModelWithProjection(text_encoder_config) - tokenizer_2 = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip") - - components = { - "unet": unet, - "controlnet": controlnet, - "scheduler": scheduler, - "vae": vae, - "text_encoder": text_encoder, - "tokenizer": tokenizer, - "text_encoder_2": text_encoder_2, - "tokenizer_2": tokenizer_2, - "feature_extractor": None, - } - return components - - # Copied from test_controlnet_sdxl.py - def get_dummy_inputs(self, device, seed=0): - if str(device).startswith("mps"): - generator = torch.manual_seed(seed) - else: - generator = torch.Generator(device=device).manual_seed(seed) - - controlnet_embedder_scale_factor = 2 - image = randn_tensor( - (1, 3, 8 * controlnet_embedder_scale_factor, 8 * controlnet_embedder_scale_factor), - generator=generator, - device=torch.device(device), - ) - - inputs = { - "prompt": "A painting of a squirrel eating a burger", - "generator": generator, - "num_inference_steps": 2, - "guidance_scale": 6.0, - "output_type": "np", - "image": image, - } - - return inputs - - # Copied from test_controlnet_sdxl.py - def test_attention_slicing_forward_pass(self): - return self._test_attention_slicing_forward_pass(expected_max_diff=2e-3) - - @unittest.skipIf( - torch_device != "cuda" or not is_xformers_available(), - reason="XFormers attention is only available with CUDA and `xformers` installed", - ) - # Copied from test_controlnet_sdxl.py - def test_xformers_attention_forwardGenerator_pass(self): - self._test_xformers_attention_forwardGenerator_pass(expected_max_diff=2e-3) - - # Copied from test_controlnet_sdxl.py - def test_inference_batch_single_identical(self): - self._test_inference_batch_single_identical(expected_max_diff=2e-3) - - @unittest.skip("We test this functionality elsewhere already.") - def test_save_load_optional_components(self): - pass - - @require_torch_accelerator - # Copied from test_controlnet_sdxl.py - def test_stable_diffusion_xl_offloads(self): - pipes = [] - components = self.get_dummy_components() - sd_pipe = self.pipeline_class(**components).to(torch_device) - pipes.append(sd_pipe) - - components = self.get_dummy_components() - sd_pipe = self.pipeline_class(**components) - sd_pipe.enable_model_cpu_offload(device=torch_device) - pipes.append(sd_pipe) - - components = self.get_dummy_components() - sd_pipe = self.pipeline_class(**components) - sd_pipe.enable_sequential_cpu_offload(device=torch_device) - pipes.append(sd_pipe) - - image_slices = [] - for pipe in pipes: - pipe.unet.set_default_attn_processor() - 
- inputs = self.get_dummy_inputs(torch_device) - image = pipe(**inputs).images - - image_slices.append(image[0, -3:, -3:, -1].flatten()) - - assert np.abs(image_slices[0] - image_slices[1]).max() < 1e-3 - assert np.abs(image_slices[0] - image_slices[2]).max() < 1e-3 - - # Copied from test_controlnet_sdxl.py - def test_stable_diffusion_xl_multi_prompts(self): - components = self.get_dummy_components() - sd_pipe = self.pipeline_class(**components).to(torch_device) - - # forward with single prompt - inputs = self.get_dummy_inputs(torch_device) - output = sd_pipe(**inputs) - image_slice_1 = output.images[0, -3:, -3:, -1] - - # forward with same prompt duplicated - inputs = self.get_dummy_inputs(torch_device) - inputs["prompt_2"] = inputs["prompt"] - output = sd_pipe(**inputs) - image_slice_2 = output.images[0, -3:, -3:, -1] - - # ensure the results are equal - assert np.abs(image_slice_1.flatten() - image_slice_2.flatten()).max() < 1e-4 - - # forward with different prompt - inputs = self.get_dummy_inputs(torch_device) - inputs["prompt_2"] = "different prompt" - output = sd_pipe(**inputs) - image_slice_3 = output.images[0, -3:, -3:, -1] - - # ensure the results are not equal - assert np.abs(image_slice_1.flatten() - image_slice_3.flatten()).max() > 1e-4 - - # manually set a negative_prompt - inputs = self.get_dummy_inputs(torch_device) - inputs["negative_prompt"] = "negative prompt" - output = sd_pipe(**inputs) - image_slice_1 = output.images[0, -3:, -3:, -1] - - # forward with same negative_prompt duplicated - inputs = self.get_dummy_inputs(torch_device) - inputs["negative_prompt"] = "negative prompt" - inputs["negative_prompt_2"] = inputs["negative_prompt"] - output = sd_pipe(**inputs) - image_slice_2 = output.images[0, -3:, -3:, -1] - - # ensure the results are equal - assert np.abs(image_slice_1.flatten() - image_slice_2.flatten()).max() < 1e-4 - - # forward with different negative_prompt - inputs = self.get_dummy_inputs(torch_device) - inputs["negative_prompt"] = "negative prompt" - inputs["negative_prompt_2"] = "different negative prompt" - output = sd_pipe(**inputs) - image_slice_3 = output.images[0, -3:, -3:, -1] - - # ensure the results are not equal - assert np.abs(image_slice_1.flatten() - image_slice_3.flatten()).max() > 1e-4 - - # Copied from test_controlnetxs.py - def test_to_dtype(self): - components = self.get_dummy_components() - pipe = self.pipeline_class(**components) - pipe.set_progress_bar_config(disable=None) - - # pipeline creates a new UNetControlNetXSModel under the hood. 
So we need to check the dtype from pipe.components - model_dtypes = [component.dtype for component in pipe.components.values() if hasattr(component, "dtype")] - self.assertTrue(all(dtype == torch.float32 for dtype in model_dtypes)) - - pipe.to(dtype=torch.float16) - model_dtypes = [component.dtype for component in pipe.components.values() if hasattr(component, "dtype")] - self.assertTrue(all(dtype == torch.float16 for dtype in model_dtypes)) - - def test_multi_vae(self): - components = self.get_dummy_components() - pipe = self.pipeline_class(**components) - pipe = pipe.to(torch_device) - pipe.set_progress_bar_config(disable=None) - - block_out_channels = pipe.vae.config.block_out_channels - norm_num_groups = pipe.vae.config.norm_num_groups - - vae_classes = [AutoencoderKL, AsymmetricAutoencoderKL, ConsistencyDecoderVAE, AutoencoderTiny] - configs = [ - get_autoencoder_kl_config(block_out_channels, norm_num_groups), - get_asym_autoencoder_kl_config(block_out_channels, norm_num_groups), - get_consistency_vae_config(block_out_channels, norm_num_groups), - get_autoencoder_tiny_config(block_out_channels), - ] - - out_np = pipe(**self.get_dummy_inputs_by_type(torch_device, input_image_type="np"))[0] - - for vae_cls, config in zip(vae_classes, configs): - vae = vae_cls(**config) - vae = vae.to(torch_device) - components["vae"] = vae - vae_pipe = self.pipeline_class(**components) - - # pipeline creates a new UNetControlNetXSModel under the hood, which aren't on device. - # So we need to move the new pipe to device. - vae_pipe.to(torch_device) - vae_pipe.set_progress_bar_config(disable=None) - - out_vae_np = vae_pipe(**self.get_dummy_inputs_by_type(torch_device, input_image_type="np"))[0] - - assert out_vae_np.shape == out_np.shape - - -@slow -@require_torch_accelerator -class StableDiffusionXLControlNetXSPipelineSlowTests(unittest.TestCase): - def tearDown(self): - super().tearDown() - gc.collect() - backend_empty_cache(torch_device) - - def test_canny(self): - controlnet = ControlNetXSAdapter.from_pretrained( - "UmerHA/Testing-ConrolNetXS-SDXL-canny", torch_dtype=torch.float16 - ) - pipe = StableDiffusionXLControlNetXSPipeline.from_pretrained( - "stabilityai/stable-diffusion-xl-base-1.0", controlnet=controlnet, torch_dtype=torch.float16 - ) - pipe.enable_sequential_cpu_offload(device=torch_device) - pipe.set_progress_bar_config(disable=None) - - generator = torch.Generator(device="cpu").manual_seed(0) - prompt = "bird" - image = load_image( - "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/sd_controlnet/bird_canny.png" - ) - - images = pipe(prompt, image=image, generator=generator, output_type="np", num_inference_steps=3).images - - assert images[0].shape == (768, 512, 3) - - original_image = images[0, -3:, -3:, -1].flatten() - expected_image = np.array([0.3202, 0.3151, 0.3328, 0.3172, 0.337, 0.3381, 0.3378, 0.3389, 0.3224]) - assert np.allclose(original_image, expected_image, atol=1e-04) - - def test_depth(self): - controlnet = ControlNetXSAdapter.from_pretrained( - "UmerHA/Testing-ConrolNetXS-SDXL-depth", torch_dtype=torch.float16 - ) - pipe = StableDiffusionXLControlNetXSPipeline.from_pretrained( - "stabilityai/stable-diffusion-xl-base-1.0", controlnet=controlnet, torch_dtype=torch.float16 - ) - pipe.enable_sequential_cpu_offload(device=torch_device) - pipe.set_progress_bar_config(disable=None) - - generator = torch.Generator(device="cpu").manual_seed(0) - prompt = "Stormtrooper's lecture" - image = load_image( - 
"https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/sd_controlnet/stormtrooper_depth.png" - ) - - images = pipe(prompt, image=image, generator=generator, output_type="np", num_inference_steps=3).images - - assert images[0].shape == (512, 512, 3) - - original_image = images[0, -3:, -3:, -1].flatten() - expected_image = np.array([0.5448, 0.5437, 0.5426, 0.5543, 0.553, 0.5475, 0.5595, 0.5602, 0.5529]) - assert np.allclose(original_image, expected_image, atol=1e-04) diff --git a/tests/pipelines/dance_diffusion/__init__.py b/tests/pipelines/dance_diffusion/__init__.py deleted file mode 100644 index e69de29bb2d1..000000000000 diff --git a/tests/pipelines/dance_diffusion/test_dance_diffusion.py b/tests/pipelines/dance_diffusion/test_dance_diffusion.py deleted file mode 100644 index a2a17532145c..000000000000 --- a/tests/pipelines/dance_diffusion/test_dance_diffusion.py +++ /dev/null @@ -1,174 +0,0 @@ -# coding=utf-8 -# Copyright 2025 HuggingFace Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import gc -import unittest - -import numpy as np -import torch - -from diffusers import DanceDiffusionPipeline, IPNDMScheduler, UNet1DModel -from diffusers.utils.testing_utils import ( - backend_empty_cache, - enable_full_determinism, - nightly, - require_torch_accelerator, - skip_mps, - torch_device, -) - -from ..pipeline_params import UNCONDITIONAL_AUDIO_GENERATION_BATCH_PARAMS, UNCONDITIONAL_AUDIO_GENERATION_PARAMS -from ..test_pipelines_common import PipelineTesterMixin - - -enable_full_determinism() - - -class DanceDiffusionPipelineFastTests(PipelineTesterMixin, unittest.TestCase): - pipeline_class = DanceDiffusionPipeline - params = UNCONDITIONAL_AUDIO_GENERATION_PARAMS - required_optional_params = PipelineTesterMixin.required_optional_params - { - "callback", - "latents", - "callback_steps", - "output_type", - "num_images_per_prompt", - } - batch_params = UNCONDITIONAL_AUDIO_GENERATION_BATCH_PARAMS - test_attention_slicing = False - - def get_dummy_components(self): - torch.manual_seed(0) - unet = UNet1DModel( - block_out_channels=(32, 32, 64), - extra_in_channels=16, - sample_size=512, - sample_rate=16_000, - in_channels=2, - out_channels=2, - flip_sin_to_cos=True, - use_timestep_embedding=False, - time_embedding_type="fourier", - mid_block_type="UNetMidBlock1D", - down_block_types=("DownBlock1DNoSkip", "DownBlock1D", "AttnDownBlock1D"), - up_block_types=("AttnUpBlock1D", "UpBlock1D", "UpBlock1DNoSkip"), - ) - scheduler = IPNDMScheduler() - - components = { - "unet": unet, - "scheduler": scheduler, - } - return components - - def get_dummy_inputs(self, device, seed=0): - if str(device).startswith("mps"): - generator = torch.manual_seed(seed) - else: - generator = torch.Generator(device=device).manual_seed(seed) - inputs = { - "batch_size": 1, - "generator": generator, - "num_inference_steps": 4, - } - return inputs - - def test_dance_diffusion(self): - device = "cpu" # ensure determinism for the device-dependent torch.Generator - components = 
self.get_dummy_components() - pipe = DanceDiffusionPipeline(**components) - pipe = pipe.to(device) - pipe.set_progress_bar_config(disable=None) - - inputs = self.get_dummy_inputs(device) - output = pipe(**inputs) - audio = output.audios - - audio_slice = audio[0, -3:, -3:] - - assert audio.shape == (1, 2, components["unet"].sample_size) - expected_slice = np.array([-0.7265, 1.0000, -0.8388, 0.1175, 0.9498, -1.0000]) - assert np.abs(audio_slice.flatten() - expected_slice).max() < 1e-2 - - @skip_mps - def test_save_load_local(self): - return super().test_save_load_local() - - @skip_mps - def test_dict_tuple_outputs_equivalent(self): - return super().test_dict_tuple_outputs_equivalent(expected_max_difference=3e-3) - - @skip_mps - def test_save_load_optional_components(self): - return super().test_save_load_optional_components() - - @skip_mps - def test_attention_slicing_forward_pass(self): - return super().test_attention_slicing_forward_pass() - - def test_inference_batch_single_identical(self): - super().test_inference_batch_single_identical(expected_max_diff=3e-3) - - -@nightly -@require_torch_accelerator -class PipelineIntegrationTests(unittest.TestCase): - def setUp(self): - # clean up the VRAM before each test - super().setUp() - gc.collect() - backend_empty_cache(torch_device) - - def tearDown(self): - # clean up the VRAM after each test - super().tearDown() - gc.collect() - backend_empty_cache(torch_device) - - def test_dance_diffusion(self): - device = torch_device - - pipe = DanceDiffusionPipeline.from_pretrained("harmonai/maestro-150k") - pipe = pipe.to(device) - pipe.set_progress_bar_config(disable=None) - - generator = torch.manual_seed(0) - output = pipe(generator=generator, num_inference_steps=100, audio_length_in_s=4.096) - audio = output.audios - - audio_slice = audio[0, -3:, -3:] - - assert audio.shape == (1, 2, pipe.unet.config.sample_size) - expected_slice = np.array([-0.0192, -0.0231, -0.0318, -0.0059, 0.0002, -0.0020]) - - assert np.abs(audio_slice.flatten() - expected_slice).max() < 1e-2 - - def test_dance_diffusion_fp16(self): - device = torch_device - - pipe = DanceDiffusionPipeline.from_pretrained("harmonai/maestro-150k", torch_dtype=torch.float16) - pipe = pipe.to(device) - pipe.set_progress_bar_config(disable=None) - - generator = torch.manual_seed(0) - output = pipe(generator=generator, num_inference_steps=100, audio_length_in_s=4.096) - audio = output.audios - - audio_slice = audio[0, -3:, -3:] - - assert audio.shape == (1, 2, pipe.unet.config.sample_size) - expected_slice = np.array([-0.0367, -0.0488, -0.0771, -0.0525, -0.0444, -0.0341]) - - assert np.abs(audio_slice.flatten() - expected_slice).max() < 1e-2 diff --git a/tests/pipelines/i2vgen_xl/__init__.py b/tests/pipelines/i2vgen_xl/__init__.py deleted file mode 100644 index e69de29bb2d1..000000000000 diff --git a/tests/pipelines/i2vgen_xl/test_i2vgenxl.py b/tests/pipelines/i2vgen_xl/test_i2vgenxl.py deleted file mode 100644 index bedd63738a36..000000000000 --- a/tests/pipelines/i2vgen_xl/test_i2vgenxl.py +++ /dev/null @@ -1,283 +0,0 @@ -# coding=utf-8 -# Copyright 2025 HuggingFace Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import gc -import random -import unittest - -import numpy as np -import pytest -import torch -from transformers import ( - CLIPImageProcessor, - CLIPTextConfig, - CLIPTextModel, - CLIPTokenizer, - CLIPVisionConfig, - CLIPVisionModelWithProjection, -) - -from diffusers import ( - AutoencoderKL, - DDIMScheduler, - I2VGenXLPipeline, -) -from diffusers.models.unets import I2VGenXLUNet -from diffusers.utils import is_xformers_available, load_image -from diffusers.utils.testing_utils import ( - backend_empty_cache, - enable_full_determinism, - floats_tensor, - is_torch_version, - numpy_cosine_similarity_distance, - require_torch_accelerator, - skip_mps, - slow, - torch_device, -) - -from ..test_pipelines_common import PipelineTesterMixin, SDFunctionTesterMixin - - -enable_full_determinism() - - -@skip_mps -class I2VGenXLPipelineFastTests(SDFunctionTesterMixin, PipelineTesterMixin, unittest.TestCase): - pipeline_class = I2VGenXLPipeline - params = frozenset(["prompt", "negative_prompt", "image"]) - batch_params = frozenset(["prompt", "negative_prompt", "image", "generator"]) - # No `output_type`. - required_optional_params = frozenset(["num_inference_steps", "generator", "latents", "return_dict"]) - - supports_dduf = False - test_layerwise_casting = True - - def get_dummy_components(self): - torch.manual_seed(0) - scheduler = DDIMScheduler( - beta_start=0.00085, - beta_end=0.012, - beta_schedule="scaled_linear", - clip_sample=False, - set_alpha_to_one=False, - ) - - torch.manual_seed(0) - unet = I2VGenXLUNet( - block_out_channels=(4, 8), - layers_per_block=1, - sample_size=32, - in_channels=4, - out_channels=4, - down_block_types=("CrossAttnDownBlock3D", "DownBlock3D"), - up_block_types=("UpBlock3D", "CrossAttnUpBlock3D"), - cross_attention_dim=4, - attention_head_dim=4, - num_attention_heads=None, - norm_num_groups=2, - ) - - torch.manual_seed(0) - vae = AutoencoderKL( - block_out_channels=(8,), - in_channels=3, - out_channels=3, - down_block_types=["DownEncoderBlock2D"], - up_block_types=["UpDecoderBlock2D"], - latent_channels=4, - sample_size=32, - norm_num_groups=2, - ) - torch.manual_seed(0) - text_encoder_config = CLIPTextConfig( - bos_token_id=0, - eos_token_id=2, - hidden_size=4, - intermediate_size=16, - layer_norm_eps=1e-05, - num_attention_heads=2, - num_hidden_layers=2, - pad_token_id=1, - vocab_size=1000, - hidden_act="gelu", - projection_dim=32, - ) - text_encoder = CLIPTextModel(text_encoder_config) - tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip") - - torch.manual_seed(0) - vision_encoder_config = CLIPVisionConfig( - hidden_size=4, - projection_dim=4, - num_hidden_layers=2, - num_attention_heads=2, - image_size=32, - intermediate_size=16, - patch_size=1, - ) - image_encoder = CLIPVisionModelWithProjection(vision_encoder_config) - - torch.manual_seed(0) - feature_extractor = CLIPImageProcessor(crop_size=32, size=32) - - components = { - "unet": unet, - "scheduler": scheduler, - "vae": vae, - "text_encoder": text_encoder, - "image_encoder": image_encoder, - "tokenizer": tokenizer, - "feature_extractor": feature_extractor, - } - return 
components - - def get_dummy_inputs(self, device, seed=0): - if str(device).startswith("mps"): - generator = torch.manual_seed(seed) - else: - generator = torch.Generator(device=device).manual_seed(seed) - - input_image = floats_tensor((1, 3, 32, 32), rng=random.Random(seed)).to(device) - inputs = { - "prompt": "A painting of a squirrel eating a burger", - "image": input_image, - "generator": generator, - "num_inference_steps": 2, - "guidance_scale": 6.0, - "output_type": "pt", - "num_frames": 4, - "width": 32, - "height": 32, - } - return inputs - - def test_text_to_video_default_case(self): - device = "cpu" # ensure determinism for the device-dependent torch.Generator - components = self.get_dummy_components() - pipe = self.pipeline_class(**components) - pipe = pipe.to(device) - pipe.set_progress_bar_config(disable=None) - - inputs = self.get_dummy_inputs(device) - inputs["output_type"] = "np" - frames = pipe(**inputs).frames - - image_slice = frames[0][0][-3:, -3:, -1] - - assert frames[0][0].shape == (32, 32, 3) - expected_slice = np.array([0.5146, 0.6525, 0.6032, 0.5204, 0.5675, 0.4125, 0.3016, 0.5172, 0.4095]) - - assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2 - - @pytest.mark.xfail( - condition=is_torch_version(">=", "2.7"), - reason="Test currently fails on PyTorch 2.7.", - strict=False, - ) - def test_save_load_local(self): - super().test_save_load_local(expected_max_difference=0.006) - - def test_sequential_cpu_offload_forward_pass(self): - super().test_sequential_cpu_offload_forward_pass(expected_max_diff=0.008) - - def test_dict_tuple_outputs_equivalent(self): - super().test_dict_tuple_outputs_equivalent(expected_max_difference=0.009) - - def test_save_load_optional_components(self): - super().test_save_load_optional_components(expected_max_difference=0.008) - - @unittest.skip("Deprecated functionality") - def test_attention_slicing_forward_pass(self): - pass - - @unittest.skipIf( - torch_device != "cuda" or not is_xformers_available(), - reason="XFormers attention is only available with CUDA and `xformers` installed", - ) - def test_xformers_attention_forwardGenerator_pass(self): - self._test_xformers_attention_forwardGenerator_pass(test_mean_pixel_difference=False, expected_max_diff=1e-2) - - def test_inference_batch_single_identical(self): - super().test_inference_batch_single_identical(batch_size=2, expected_max_diff=0.008) - - def test_model_cpu_offload_forward_pass(self): - super().test_model_cpu_offload_forward_pass(expected_max_diff=0.008) - - def test_num_videos_per_prompt(self): - device = "cpu" # ensure determinism for the device-dependent torch.Generator - components = self.get_dummy_components() - pipe = self.pipeline_class(**components) - pipe = pipe.to(device) - pipe.set_progress_bar_config(disable=None) - - inputs = self.get_dummy_inputs(device) - inputs["output_type"] = "np" - frames = pipe(**inputs, num_videos_per_prompt=2).frames - - assert frames.shape == (2, 4, 32, 32, 3) - assert frames[0][0].shape == (32, 32, 3) - - image_slice = frames[0][0][-3:, -3:, -1] - expected_slice = np.array([0.5146, 0.6525, 0.6032, 0.5204, 0.5675, 0.4125, 0.3016, 0.5172, 0.4095]) - - assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2 - - @unittest.skip("Test not supported for now.") - def test_encode_prompt_works_in_isolation(self): - pass - - -@slow -@require_torch_accelerator -class I2VGenXLPipelineSlowTests(unittest.TestCase): - def setUp(self): - # clean up the VRAM before each test - super().setUp() - gc.collect() - 
backend_empty_cache(torch_device) - - def tearDown(self): - # clean up the VRAM after each test - super().tearDown() - gc.collect() - backend_empty_cache(torch_device) - - def test_i2vgen_xl(self): - pipe = I2VGenXLPipeline.from_pretrained("ali-vilab/i2vgen-xl", torch_dtype=torch.float16, variant="fp16") - pipe.enable_model_cpu_offload(device=torch_device) - pipe.set_progress_bar_config(disable=None) - image = load_image( - "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/pix2pix/cat_6.png?download=true" - ) - - generator = torch.Generator("cpu").manual_seed(0) - num_frames = 3 - - output = pipe( - image=image, - prompt="my cat", - num_frames=num_frames, - generator=generator, - num_inference_steps=3, - output_type="np", - ) - - image = output.frames[0] - assert image.shape == (num_frames, 704, 1280, 3) - - image_slice = image[0, -3:, -3:, -1] - expected_slice = np.array([0.5482, 0.6244, 0.6274, 0.4584, 0.5935, 0.5937, 0.4579, 0.5767, 0.5892]) - assert numpy_cosine_similarity_distance(image_slice.flatten(), expected_slice.flatten()) < 1e-3 diff --git a/tests/pipelines/musicldm/__init__.py b/tests/pipelines/musicldm/__init__.py deleted file mode 100644 index e69de29bb2d1..000000000000 diff --git a/tests/pipelines/musicldm/test_musicldm.py b/tests/pipelines/musicldm/test_musicldm.py deleted file mode 100644 index 5d6392865bc8..000000000000 --- a/tests/pipelines/musicldm/test_musicldm.py +++ /dev/null @@ -1,478 +0,0 @@ -# coding=utf-8 -# Copyright 2025 HuggingFace Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
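One idiom recurs throughout these deleted files, including the MusicLDM tests that follow: regression-test an output by flattening a small corner slice and comparing it elementwise against hard-coded reference values within a loose tolerance. A self-contained sketch of that pattern, using placeholder values rather than numbers from any real checkpoint:

import numpy as np

def assert_slice_close(images: np.ndarray, expected_slice: np.ndarray, max_diff: float = 1e-2) -> None:
    # Take the 3x3 bottom-right corner of the last channel of the first image.
    image_slice = images[0, -3:, -3:, -1].flatten()
    assert np.abs(image_slice - expected_slice).max() < max_diff

# Placeholder data: an all-zero (batch, height, width, channels) array.
images = np.zeros((1, 64, 64, 3), dtype=np.float32)
assert_slice_close(images, np.zeros(9, dtype=np.float32))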
- - -import gc -import unittest - -import numpy as np -import torch -from transformers import ( - ClapAudioConfig, - ClapConfig, - ClapFeatureExtractor, - ClapModel, - ClapTextConfig, - RobertaTokenizer, - SpeechT5HifiGan, - SpeechT5HifiGanConfig, -) - -from diffusers import ( - AutoencoderKL, - DDIMScheduler, - LMSDiscreteScheduler, - MusicLDMPipeline, - PNDMScheduler, - UNet2DConditionModel, -) -from diffusers.utils import is_xformers_available -from diffusers.utils.testing_utils import ( - backend_empty_cache, - enable_full_determinism, - nightly, - require_torch_accelerator, - torch_device, -) - -from ..pipeline_params import TEXT_TO_AUDIO_BATCH_PARAMS, TEXT_TO_AUDIO_PARAMS -from ..test_pipelines_common import PipelineTesterMixin - - -enable_full_determinism() - - -class MusicLDMPipelineFastTests(PipelineTesterMixin, unittest.TestCase): - pipeline_class = MusicLDMPipeline - params = TEXT_TO_AUDIO_PARAMS - batch_params = TEXT_TO_AUDIO_BATCH_PARAMS - required_optional_params = frozenset( - [ - "num_inference_steps", - "num_waveforms_per_prompt", - "generator", - "latents", - "output_type", - "return_dict", - "callback", - "callback_steps", - ] - ) - - supports_dduf = False - - def get_dummy_components(self): - torch.manual_seed(0) - unet = UNet2DConditionModel( - block_out_channels=(32, 64), - layers_per_block=2, - sample_size=32, - in_channels=4, - out_channels=4, - down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"), - up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"), - cross_attention_dim=(32, 64), - class_embed_type="simple_projection", - projection_class_embeddings_input_dim=32, - class_embeddings_concat=True, - ) - scheduler = DDIMScheduler( - beta_start=0.00085, - beta_end=0.012, - beta_schedule="scaled_linear", - clip_sample=False, - set_alpha_to_one=False, - ) - torch.manual_seed(0) - vae = AutoencoderKL( - block_out_channels=[32, 64], - in_channels=1, - out_channels=1, - down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"], - up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"], - latent_channels=4, - ) - torch.manual_seed(0) - text_branch_config = ClapTextConfig( - bos_token_id=0, - eos_token_id=2, - hidden_size=16, - intermediate_size=37, - layer_norm_eps=1e-05, - num_attention_heads=2, - num_hidden_layers=2, - pad_token_id=1, - vocab_size=1000, - ) - audio_branch_config = ClapAudioConfig( - spec_size=64, - window_size=4, - num_mel_bins=64, - intermediate_size=37, - layer_norm_eps=1e-05, - depths=[2, 2], - num_attention_heads=[2, 2], - num_hidden_layers=2, - hidden_size=192, - patch_size=2, - patch_stride=2, - patch_embed_input_channels=4, - ) - text_encoder_config = ClapConfig.from_text_audio_configs( - text_config=text_branch_config, audio_config=audio_branch_config, projection_dim=32 - ) - text_encoder = ClapModel(text_encoder_config) - tokenizer = RobertaTokenizer.from_pretrained("hf-internal-testing/tiny-random-roberta", model_max_length=77) - feature_extractor = ClapFeatureExtractor.from_pretrained( - "hf-internal-testing/tiny-random-ClapModel", hop_length=7900 - ) - - torch.manual_seed(0) - vocoder_config = SpeechT5HifiGanConfig( - model_in_dim=8, - sampling_rate=16000, - upsample_initial_channel=16, - upsample_rates=[2, 2], - upsample_kernel_sizes=[4, 4], - resblock_kernel_sizes=[3, 7], - resblock_dilation_sizes=[[1, 3, 5], [1, 3, 5]], - normalize_before=False, - ) - - vocoder = SpeechT5HifiGan(vocoder_config) - - components = { - "unet": unet, - "scheduler": scheduler, - "vae": vae, - "text_encoder": text_encoder, - "tokenizer": tokenizer, - 
"feature_extractor": feature_extractor, - "vocoder": vocoder, - } - return components - - def get_dummy_inputs(self, device, seed=0): - if str(device).startswith("mps"): - generator = torch.manual_seed(seed) - else: - generator = torch.Generator(device=device).manual_seed(seed) - inputs = { - "prompt": "A hammer hitting a wooden surface", - "generator": generator, - "num_inference_steps": 2, - "guidance_scale": 6.0, - } - return inputs - - def test_musicldm_ddim(self): - device = "cpu" # ensure determinism for the device-dependent torch.Generator - - components = self.get_dummy_components() - musicldm_pipe = MusicLDMPipeline(**components) - musicldm_pipe = musicldm_pipe.to(torch_device) - musicldm_pipe.set_progress_bar_config(disable=None) - - inputs = self.get_dummy_inputs(device) - output = musicldm_pipe(**inputs) - audio = output.audios[0] - - assert audio.ndim == 1 - assert len(audio) == 256 - - audio_slice = audio[:10] - expected_slice = np.array( - [-0.0027, -0.0036, -0.0037, -0.0020, -0.0035, -0.0019, -0.0037, -0.0020, -0.0038, -0.0019] - ) - - assert np.abs(audio_slice - expected_slice).max() < 1e-4 - - def test_musicldm_prompt_embeds(self): - components = self.get_dummy_components() - musicldm_pipe = MusicLDMPipeline(**components) - musicldm_pipe = musicldm_pipe.to(torch_device) - musicldm_pipe = musicldm_pipe.to(torch_device) - musicldm_pipe.set_progress_bar_config(disable=None) - - inputs = self.get_dummy_inputs(torch_device) - inputs["prompt"] = 3 * [inputs["prompt"]] - - # forward - output = musicldm_pipe(**inputs) - audio_1 = output.audios[0] - - inputs = self.get_dummy_inputs(torch_device) - prompt = 3 * [inputs.pop("prompt")] - - text_inputs = musicldm_pipe.tokenizer( - prompt, - padding="max_length", - max_length=musicldm_pipe.tokenizer.model_max_length, - truncation=True, - return_tensors="pt", - ) - text_inputs = text_inputs["input_ids"].to(torch_device) - - prompt_embeds = musicldm_pipe.text_encoder.get_text_features(text_inputs) - - inputs["prompt_embeds"] = prompt_embeds - - # forward - output = musicldm_pipe(**inputs) - audio_2 = output.audios[0] - - assert np.abs(audio_1 - audio_2).max() < 1e-2 - - def test_musicldm_negative_prompt_embeds(self): - components = self.get_dummy_components() - musicldm_pipe = MusicLDMPipeline(**components) - musicldm_pipe = musicldm_pipe.to(torch_device) - musicldm_pipe = musicldm_pipe.to(torch_device) - musicldm_pipe.set_progress_bar_config(disable=None) - - inputs = self.get_dummy_inputs(torch_device) - negative_prompt = 3 * ["this is a negative prompt"] - inputs["negative_prompt"] = negative_prompt - inputs["prompt"] = 3 * [inputs["prompt"]] - - # forward - output = musicldm_pipe(**inputs) - audio_1 = output.audios[0] - - inputs = self.get_dummy_inputs(torch_device) - prompt = 3 * [inputs.pop("prompt")] - - embeds = [] - for p in [prompt, negative_prompt]: - text_inputs = musicldm_pipe.tokenizer( - p, - padding="max_length", - max_length=musicldm_pipe.tokenizer.model_max_length, - truncation=True, - return_tensors="pt", - ) - text_inputs = text_inputs["input_ids"].to(torch_device) - - text_embeds = musicldm_pipe.text_encoder.get_text_features( - text_inputs, - ) - embeds.append(text_embeds) - - inputs["prompt_embeds"], inputs["negative_prompt_embeds"] = embeds - - # forward - output = musicldm_pipe(**inputs) - audio_2 = output.audios[0] - - assert np.abs(audio_1 - audio_2).max() < 1e-2 - - def test_musicldm_negative_prompt(self): - device = "cpu" # ensure determinism for the device-dependent torch.Generator - components = 
self.get_dummy_components() - components["scheduler"] = PNDMScheduler(skip_prk_steps=True) - musicldm_pipe = MusicLDMPipeline(**components) - musicldm_pipe = musicldm_pipe.to(device) - musicldm_pipe.set_progress_bar_config(disable=None) - - inputs = self.get_dummy_inputs(device) - negative_prompt = "egg cracking" - output = musicldm_pipe(**inputs, negative_prompt=negative_prompt) - audio = output.audios[0] - - assert audio.ndim == 1 - assert len(audio) == 256 - - audio_slice = audio[:10] - expected_slice = np.array( - [-0.0027, -0.0036, -0.0037, -0.0019, -0.0035, -0.0018, -0.0037, -0.0021, -0.0038, -0.0018] - ) - - assert np.abs(audio_slice - expected_slice).max() < 1e-4 - - def test_musicldm_num_waveforms_per_prompt(self): - device = "cpu" # ensure determinism for the device-dependent torch.Generator - components = self.get_dummy_components() - components["scheduler"] = PNDMScheduler(skip_prk_steps=True) - musicldm_pipe = MusicLDMPipeline(**components) - musicldm_pipe = musicldm_pipe.to(device) - musicldm_pipe.set_progress_bar_config(disable=None) - - prompt = "A hammer hitting a wooden surface" - - # test num_waveforms_per_prompt=1 (default) - audios = musicldm_pipe(prompt, num_inference_steps=2).audios - - assert audios.shape == (1, 256) - - # test num_waveforms_per_prompt=1 (default) for batch of prompts - batch_size = 2 - audios = musicldm_pipe([prompt] * batch_size, num_inference_steps=2).audios - - assert audios.shape == (batch_size, 256) - - # test num_waveforms_per_prompt for single prompt - num_waveforms_per_prompt = 2 - audios = musicldm_pipe(prompt, num_inference_steps=2, num_waveforms_per_prompt=num_waveforms_per_prompt).audios - - assert audios.shape == (num_waveforms_per_prompt, 256) - - # test num_waveforms_per_prompt for batch of prompts - batch_size = 2 - audios = musicldm_pipe( - [prompt] * batch_size, num_inference_steps=2, num_waveforms_per_prompt=num_waveforms_per_prompt - ).audios - - assert audios.shape == (batch_size * num_waveforms_per_prompt, 256) - - def test_musicldm_audio_length_in_s(self): - device = "cpu" # ensure determinism for the device-dependent torch.Generator - components = self.get_dummy_components() - musicldm_pipe = MusicLDMPipeline(**components) - musicldm_pipe = musicldm_pipe.to(torch_device) - musicldm_pipe.set_progress_bar_config(disable=None) - vocoder_sampling_rate = musicldm_pipe.vocoder.config.sampling_rate - - inputs = self.get_dummy_inputs(device) - output = musicldm_pipe(audio_length_in_s=0.016, **inputs) - audio = output.audios[0] - - assert audio.ndim == 1 - assert len(audio) / vocoder_sampling_rate == 0.016 - - output = musicldm_pipe(audio_length_in_s=0.032, **inputs) - audio = output.audios[0] - - assert audio.ndim == 1 - assert len(audio) / vocoder_sampling_rate == 0.032 - - def test_musicldm_vocoder_model_in_dim(self): - components = self.get_dummy_components() - musicldm_pipe = MusicLDMPipeline(**components) - musicldm_pipe = musicldm_pipe.to(torch_device) - musicldm_pipe.set_progress_bar_config(disable=None) - - prompt = ["hey"] - - output = musicldm_pipe(prompt, num_inference_steps=1) - audio_shape = output.audios.shape - assert audio_shape == (1, 256) - - config = musicldm_pipe.vocoder.config - config.model_in_dim *= 2 - musicldm_pipe.vocoder = SpeechT5HifiGan(config).to(torch_device) - output = musicldm_pipe(prompt, num_inference_steps=1) - audio_shape = output.audios.shape - # waveform shape is unchanged, we just have 2x the number of mel channels in the spectrogram - assert audio_shape == (1, 256) - - def 
test_attention_slicing_forward_pass(self): - self._test_attention_slicing_forward_pass(test_mean_pixel_difference=False) - - def test_inference_batch_single_identical(self): - self._test_inference_batch_single_identical() - - @unittest.skipIf( - torch_device != "cuda" or not is_xformers_available(), - reason="XFormers attention is only available with CUDA and `xformers` installed", - ) - def test_xformers_attention_forwardGenerator_pass(self): - self._test_xformers_attention_forwardGenerator_pass(test_mean_pixel_difference=False) - - def test_to_dtype(self): - components = self.get_dummy_components() - pipe = self.pipeline_class(**components) - pipe.set_progress_bar_config(disable=None) - - # The method component.dtype returns the dtype of the first parameter registered in the model, not the - # dtype of the entire model. In the case of CLAP, the first parameter is a float64 constant (logit scale) - model_dtypes = {key: component.dtype for key, component in components.items() if hasattr(component, "dtype")} - - # Without the logit scale parameters, everything is float32 - model_dtypes.pop("text_encoder") - self.assertTrue(all(dtype == torch.float32 for dtype in model_dtypes.values())) - - # the CLAP sub-models are float32 - model_dtypes["clap_text_branch"] = components["text_encoder"].text_model.dtype - self.assertTrue(all(dtype == torch.float32 for dtype in model_dtypes.values())) - - # Once we send to fp16, all params are in half-precision, including the logit scale - pipe.to(dtype=torch.float16) - model_dtypes = {key: component.dtype for key, component in components.items() if hasattr(component, "dtype")} - self.assertTrue(all(dtype == torch.float16 for dtype in model_dtypes.values())) - - -@nightly -@require_torch_accelerator -class MusicLDMPipelineNightlyTests(unittest.TestCase): - def setUp(self): - super().setUp() - gc.collect() - backend_empty_cache(torch_device) - - def tearDown(self): - super().tearDown() - gc.collect() - backend_empty_cache(torch_device) - - def get_inputs(self, device, generator_device="cpu", dtype=torch.float32, seed=0): - generator = torch.Generator(device=generator_device).manual_seed(seed) - latents = np.random.RandomState(seed).standard_normal((1, 8, 128, 16)) - latents = torch.from_numpy(latents).to(device=device, dtype=dtype) - inputs = { - "prompt": "A hammer hitting a wooden surface", - "latents": latents, - "generator": generator, - "num_inference_steps": 3, - "guidance_scale": 2.5, - } - return inputs - - def test_musicldm(self): - musicldm_pipe = MusicLDMPipeline.from_pretrained("cvssp/musicldm") - musicldm_pipe = musicldm_pipe.to(torch_device) - musicldm_pipe.set_progress_bar_config(disable=None) - - inputs = self.get_inputs(torch_device) - inputs["num_inference_steps"] = 25 - audio = musicldm_pipe(**inputs).audios[0] - - assert audio.ndim == 1 - assert len(audio) == 81952 - - # check the portion of the generated audio with the largest dynamic range (reduces flakiness) - audio_slice = audio[8680:8690] - expected_slice = np.array( - [-0.1042, -0.1068, -0.1235, -0.1387, -0.1428, -0.136, -0.1213, -0.1097, -0.0967, -0.0945] - ) - max_diff = np.abs(expected_slice - audio_slice).max() - assert max_diff < 1e-3 - - def test_musicldm_lms(self): - musicldm_pipe = MusicLDMPipeline.from_pretrained("cvssp/musicldm") - musicldm_pipe.scheduler = LMSDiscreteScheduler.from_config(musicldm_pipe.scheduler.config) - musicldm_pipe = musicldm_pipe.to(torch_device) - musicldm_pipe.set_progress_bar_config(disable=None) - - inputs = self.get_inputs(torch_device) - audio = 
musicldm_pipe(**inputs).audios[0] - - assert audio.ndim == 1 - assert len(audio) == 81952 - - # check the portion of the generated audio with the largest dynamic range (reduces flakiness) - audio_slice = audio[58020:58030] - expected_slice = np.array([0.3592, 0.3477, 0.4084, 0.4665, 0.5048, 0.5891, 0.6461, 0.5579, 0.4595, 0.4403]) - max_diff = np.abs(expected_slice - audio_slice).max() - assert max_diff < 1e-3 diff --git a/tests/pipelines/paint_by_example/__init__.py b/tests/pipelines/paint_by_example/__init__.py deleted file mode 100644 index e69de29bb2d1..000000000000 diff --git a/tests/pipelines/paint_by_example/test_paint_by_example.py b/tests/pipelines/paint_by_example/test_paint_by_example.py deleted file mode 100644 index f122c7411d48..000000000000 --- a/tests/pipelines/paint_by_example/test_paint_by_example.py +++ /dev/null @@ -1,229 +0,0 @@ -# coding=utf-8 -# Copyright 2025 HuggingFace Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import gc -import random -import unittest - -import numpy as np -import torch -from PIL import Image -from transformers import CLIPImageProcessor, CLIPVisionConfig - -from diffusers import AutoencoderKL, PaintByExamplePipeline, PNDMScheduler, UNet2DConditionModel -from diffusers.pipelines.paint_by_example import PaintByExampleImageEncoder -from diffusers.utils.testing_utils import ( - backend_empty_cache, - enable_full_determinism, - floats_tensor, - load_image, - nightly, - require_torch_accelerator, - torch_device, -) - -from ..pipeline_params import IMAGE_GUIDED_IMAGE_INPAINTING_BATCH_PARAMS, IMAGE_GUIDED_IMAGE_INPAINTING_PARAMS -from ..test_pipelines_common import PipelineTesterMixin - - -enable_full_determinism() - - -class PaintByExamplePipelineFastTests(PipelineTesterMixin, unittest.TestCase): - pipeline_class = PaintByExamplePipeline - params = IMAGE_GUIDED_IMAGE_INPAINTING_PARAMS - batch_params = IMAGE_GUIDED_IMAGE_INPAINTING_BATCH_PARAMS - image_params = frozenset([]) # TODO: update the image_params once refactored VaeImageProcessor.preprocess - - supports_dduf = False - - def get_dummy_components(self): - torch.manual_seed(0) - unet = UNet2DConditionModel( - block_out_channels=(32, 64), - layers_per_block=2, - sample_size=32, - in_channels=9, - out_channels=4, - down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"), - up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"), - cross_attention_dim=32, - ) - scheduler = PNDMScheduler(skip_prk_steps=True) - torch.manual_seed(0) - vae = AutoencoderKL( - block_out_channels=[32, 64], - in_channels=3, - out_channels=3, - down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"], - up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"], - latent_channels=4, - ) - torch.manual_seed(0) - config = CLIPVisionConfig( - hidden_size=32, - projection_dim=32, - intermediate_size=37, - layer_norm_eps=1e-05, - num_attention_heads=4, - num_hidden_layers=5, - image_size=32, - patch_size=4, - ) - image_encoder = PaintByExampleImageEncoder(config, proj_size=32) - feature_extractor =
CLIPImageProcessor(crop_size=32, size=32) - - components = { - "unet": unet, - "scheduler": scheduler, - "vae": vae, - "image_encoder": image_encoder, - "safety_checker": None, - "feature_extractor": feature_extractor, - } - return components - - def convert_to_pt(self, image): - image = np.array(image.convert("RGB")) - image = image[None].transpose(0, 3, 1, 2) - image = torch.from_numpy(image).to(dtype=torch.float32) / 127.5 - 1.0 - return image - - def get_dummy_inputs(self, device="cpu", seed=0): - # TODO: use tensor inputs instead of PIL, this is here just to leave the old expected_slices untouched - image = floats_tensor((1, 3, 32, 32), rng=random.Random(seed)).to(device) - image = image.cpu().permute(0, 2, 3, 1)[0] - init_image = Image.fromarray(np.uint8(image)).convert("RGB").resize((64, 64)) - mask_image = Image.fromarray(np.uint8(image + 4)).convert("RGB").resize((64, 64)) - example_image = Image.fromarray(np.uint8(image)).convert("RGB").resize((32, 32)) - - if str(device).startswith("mps"): - generator = torch.manual_seed(seed) - else: - generator = torch.Generator(device=device).manual_seed(seed) - inputs = { - "example_image": example_image, - "image": init_image, - "mask_image": mask_image, - "generator": generator, - "num_inference_steps": 2, - "guidance_scale": 6.0, - "output_type": "np", - } - return inputs - - def test_paint_by_example_inpaint(self): - components = self.get_dummy_components() - - # make sure here that pndm scheduler skips prk - pipe = PaintByExamplePipeline(**components) - pipe = pipe.to("cpu") - pipe.set_progress_bar_config(disable=None) - - inputs = self.get_dummy_inputs() - output = pipe(**inputs) - image = output.images - - image_slice = image[0, -3:, -3:, -1] - - assert image.shape == (1, 64, 64, 3) - expected_slice = np.array([0.4686, 0.5687, 0.4007, 0.5218, 0.5741, 0.4482, 0.4940, 0.4629, 0.4503]) - - assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2 - - def test_paint_by_example_image_tensor(self): - device = "cpu" - inputs = self.get_dummy_inputs() - inputs.pop("mask_image") - image = self.convert_to_pt(inputs.pop("image")) - mask_image = image.clamp(0, 1) / 2 - - # make sure here that pndm scheduler skips prk - pipe = PaintByExamplePipeline(**self.get_dummy_components()) - pipe = pipe.to(device) - pipe.set_progress_bar_config(disable=None) - - output = pipe(image=image, mask_image=mask_image[:, 0], **inputs) - out_1 = output.images - - image = image.cpu().permute(0, 2, 3, 1)[0] - mask_image = mask_image.cpu().permute(0, 2, 3, 1)[0] - - image = Image.fromarray(np.uint8(image)).convert("RGB") - mask_image = Image.fromarray(np.uint8(mask_image)).convert("RGB") - - output = pipe(**self.get_dummy_inputs()) - out_2 = output.images - - assert out_1.shape == (1, 64, 64, 3) - assert np.abs(out_1.flatten() - out_2.flatten()).max() < 5e-2 - - def test_inference_batch_single_identical(self): - super().test_inference_batch_single_identical(expected_max_diff=3e-3) - - -@nightly -@require_torch_accelerator -class PaintByExamplePipelineIntegrationTests(unittest.TestCase): - def setUp(self): - # clean up the VRAM before each test - super().setUp() - gc.collect() - backend_empty_cache(torch_device) - - def tearDown(self): - # clean up the VRAM after each test - super().tearDown() - gc.collect() - backend_empty_cache(torch_device) - - def test_paint_by_example(self): - # make sure here that pndm scheduler skips prk - init_image = load_image( - "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main" - 
"/paint_by_example/dog_in_bucket.png" - ) - mask_image = load_image( - "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main" - "/paint_by_example/mask.png" - ) - example_image = load_image( - "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main" - "/paint_by_example/panda.jpg" - ) - - pipe = PaintByExamplePipeline.from_pretrained("Fantasy-Studio/Paint-by-Example") - pipe = pipe.to(torch_device) - pipe.set_progress_bar_config(disable=None) - - generator = torch.manual_seed(321) - output = pipe( - image=init_image, - mask_image=mask_image, - example_image=example_image, - generator=generator, - guidance_scale=5.0, - num_inference_steps=50, - output_type="np", - ) - - image = output.images - - image_slice = image[0, -3:, -3:, -1] - - assert image.shape == (1, 512, 512, 3) - expected_slice = np.array([0.4834, 0.4811, 0.4874, 0.5122, 0.5081, 0.5144, 0.5291, 0.5290, 0.5374]) - - assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2 diff --git a/tests/pipelines/pia/__init__.py b/tests/pipelines/pia/__init__.py deleted file mode 100644 index e69de29bb2d1..000000000000 diff --git a/tests/pipelines/pia/test_pia.py b/tests/pipelines/pia/test_pia.py deleted file mode 100644 index 1156bf32dafa..000000000000 --- a/tests/pipelines/pia/test_pia.py +++ /dev/null @@ -1,448 +0,0 @@ -import random -import unittest - -import numpy as np -import torch -from transformers import CLIPTextConfig, CLIPTextModel, CLIPTokenizer - -import diffusers -from diffusers import ( - AutoencoderKL, - DDIMScheduler, - DPMSolverMultistepScheduler, - LCMScheduler, - MotionAdapter, - PIAPipeline, - StableDiffusionPipeline, - UNet2DConditionModel, - UNetMotionModel, -) -from diffusers.utils import is_xformers_available, logging -from diffusers.utils.testing_utils import floats_tensor, require_accelerator, torch_device - -from ..test_pipelines_common import IPAdapterTesterMixin, PipelineFromPipeTesterMixin, PipelineTesterMixin - - -def to_np(tensor): - if isinstance(tensor, torch.Tensor): - tensor = tensor.detach().cpu().numpy() - - return tensor - - -class PIAPipelineFastTests(IPAdapterTesterMixin, PipelineTesterMixin, PipelineFromPipeTesterMixin, unittest.TestCase): - pipeline_class = PIAPipeline - params = frozenset( - [ - "prompt", - "height", - "width", - "guidance_scale", - "negative_prompt", - "prompt_embeds", - "negative_prompt_embeds", - "cross_attention_kwargs", - ] - ) - batch_params = frozenset(["prompt", "image", "generator"]) - required_optional_params = frozenset( - [ - "num_inference_steps", - "generator", - "latents", - "return_dict", - "callback_on_step_end", - "callback_on_step_end_tensor_inputs", - ] - ) - test_layerwise_casting = True - test_group_offloading = True - - def get_dummy_components(self): - cross_attention_dim = 8 - block_out_channels = (8, 8) - - torch.manual_seed(0) - unet = UNet2DConditionModel( - block_out_channels=block_out_channels, - layers_per_block=2, - sample_size=8, - in_channels=4, - out_channels=4, - down_block_types=("CrossAttnDownBlock2D", "DownBlock2D"), - up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"), - cross_attention_dim=cross_attention_dim, - norm_num_groups=2, - ) - scheduler = DDIMScheduler( - beta_start=0.00085, - beta_end=0.012, - beta_schedule="linear", - clip_sample=False, - ) - torch.manual_seed(0) - vae = AutoencoderKL( - block_out_channels=block_out_channels, - in_channels=3, - out_channels=3, - down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"], - up_block_types=["UpDecoderBlock2D", 
"UpDecoderBlock2D"], - latent_channels=4, - norm_num_groups=2, - ) - torch.manual_seed(0) - text_encoder_config = CLIPTextConfig( - bos_token_id=0, - eos_token_id=2, - hidden_size=cross_attention_dim, - intermediate_size=37, - layer_norm_eps=1e-05, - num_attention_heads=4, - num_hidden_layers=5, - pad_token_id=1, - vocab_size=1000, - ) - text_encoder = CLIPTextModel(text_encoder_config) - tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip") - torch.manual_seed(0) - motion_adapter = MotionAdapter( - block_out_channels=block_out_channels, - motion_layers_per_block=2, - motion_norm_num_groups=2, - motion_num_attention_heads=4, - conv_in_channels=9, - ) - - components = { - "unet": unet, - "scheduler": scheduler, - "vae": vae, - "motion_adapter": motion_adapter, - "text_encoder": text_encoder, - "tokenizer": tokenizer, - "feature_extractor": None, - "image_encoder": None, - } - return components - - def get_dummy_inputs(self, device, seed=0): - if str(device).startswith("mps"): - generator = torch.manual_seed(seed) - else: - generator = torch.Generator(device=device).manual_seed(seed) - - image = floats_tensor((1, 3, 8, 8), rng=random.Random(seed)).to(device) - inputs = { - "image": image, - "prompt": "A painting of a squirrel eating a burger", - "generator": generator, - "num_inference_steps": 2, - "guidance_scale": 7.5, - "output_type": "pt", - } - return inputs - - def test_from_pipe_consistent_config(self): - assert self.original_pipeline_class == StableDiffusionPipeline - original_repo = "hf-internal-testing/tinier-stable-diffusion-pipe" - original_kwargs = {"requires_safety_checker": False} - - # create original_pipeline_class(sd) - pipe_original = self.original_pipeline_class.from_pretrained(original_repo, **original_kwargs) - - # original_pipeline_class(sd) -> pipeline_class - pipe_components = self.get_dummy_components() - pipe_additional_components = {} - for name, component in pipe_components.items(): - if name not in pipe_original.components: - pipe_additional_components[name] = component - - pipe = self.pipeline_class.from_pipe(pipe_original, **pipe_additional_components) - - # pipeline_class -> original_pipeline_class(sd) - original_pipe_additional_components = {} - for name, component in pipe_original.components.items(): - if name not in pipe.components or not isinstance(component, pipe.components[name].__class__): - original_pipe_additional_components[name] = component - - pipe_original_2 = self.original_pipeline_class.from_pipe(pipe, **original_pipe_additional_components) - - # compare the config - original_config = {k: v for k, v in pipe_original.config.items() if not k.startswith("_")} - original_config_2 = {k: v for k, v in pipe_original_2.config.items() if not k.startswith("_")} - assert original_config_2 == original_config - - def test_motion_unet_loading(self): - components = self.get_dummy_components() - pipe = self.pipeline_class(**components) - - assert isinstance(pipe.unet, UNetMotionModel) - - def test_ip_adapter(self): - expected_pipe_slice = None - - if torch_device == "cpu": - expected_pipe_slice = np.array( - [ - 0.5475, - 0.5769, - 0.4873, - 0.5064, - 0.4445, - 0.5876, - 0.5453, - 0.4102, - 0.5247, - 0.5370, - 0.3406, - 0.4322, - 0.3991, - 0.3756, - 0.5438, - 0.4780, - 0.5087, - 0.5248, - 0.6243, - 0.5506, - 0.3491, - 0.5440, - 0.6111, - 0.5122, - 0.5326, - 0.5180, - 0.5538, - ] - ) - return super().test_ip_adapter(expected_pipe_slice=expected_pipe_slice) - - def test_dict_tuple_outputs_equivalent(self): - expected_slice = None - if 
torch_device == "cpu":
-            expected_slice = np.array([0.5476, 0.4092, 0.5289, 0.4755, 0.5092, 0.5186, 0.5403, 0.5287, 0.5467])
-        return super().test_dict_tuple_outputs_equivalent(expected_slice=expected_slice)
-
-    @unittest.skip("Attention slicing is not enabled in this pipeline")
-    def test_attention_slicing_forward_pass(self):
-        pass
-
-    def test_inference_batch_single_identical(
-        self,
-        batch_size=2,
-        expected_max_diff=1e-4,
-        additional_params_copy_to_batched_inputs=["num_inference_steps"],
-    ):
-        components = self.get_dummy_components()
-        pipe = self.pipeline_class(**components)
-        for component in pipe.components.values():
-            if hasattr(component, "set_default_attn_processor"):
-                component.set_default_attn_processor()
-
-        pipe.to(torch_device)
-        pipe.set_progress_bar_config(disable=None)
-        inputs = self.get_dummy_inputs(torch_device)
-        # Reset generator in case it has been used in self.get_dummy_inputs
-        inputs["generator"] = self.get_generator(0)
-
-        logger = logging.get_logger(pipe.__module__)
-        logger.setLevel(level=diffusers.logging.FATAL)
-
-        # batchify inputs
-        batched_inputs = {}
-        batched_inputs.update(inputs)
-
-        for name in self.batch_params:
-            if name not in inputs:
-                continue
-
-            value = inputs[name]
-            if name == "prompt":
-                len_prompt = len(value)
-                batched_inputs[name] = [value[: len_prompt // i] for i in range(1, batch_size + 1)]
-                batched_inputs[name][-1] = 100 * "very long"
-
-            else:
-                batched_inputs[name] = batch_size * [value]
-
-        if "generator" in inputs:
-            batched_inputs["generator"] = [self.get_generator(i) for i in range(batch_size)]
-
-        if "batch_size" in inputs:
-            batched_inputs["batch_size"] = batch_size
-
-        for arg in additional_params_copy_to_batched_inputs:
-            batched_inputs[arg] = inputs[arg]
-
-        output = pipe(**inputs)
-        output_batch = pipe(**batched_inputs)
-
-        assert output_batch[0].shape[0] == batch_size
-
-        max_diff = np.abs(to_np(output_batch[0][0]) - to_np(output[0][0])).max()
-        assert max_diff < expected_max_diff
-
-    @require_accelerator
-    def test_to_device(self):
-        components = self.get_dummy_components()
-        pipe = self.pipeline_class(**components)
-        pipe.set_progress_bar_config(disable=None)
-
-        pipe.to("cpu")
-        # pipeline creates a new motion UNet under the hood. So we need to check the device from pipe.components
-        model_devices = [
-            component.device.type for component in pipe.components.values() if hasattr(component, "device")
-        ]
-        self.assertTrue(all(device == "cpu" for device in model_devices))
-
-        output_cpu = pipe(**self.get_dummy_inputs("cpu"))[0]
-        self.assertTrue(np.isnan(output_cpu).sum() == 0)
-
-        pipe.to(torch_device)
-        model_devices = [
-            component.device.type for component in pipe.components.values() if hasattr(component, "device")
-        ]
-        self.assertTrue(all(device == torch_device for device in model_devices))
-
-        output_device = pipe(**self.get_dummy_inputs(torch_device))[0]
-        self.assertTrue(np.isnan(to_np(output_device)).sum() == 0)
-
-    def test_to_dtype(self):
-        components = self.get_dummy_components()
-        pipe = self.pipeline_class(**components)
-        pipe.set_progress_bar_config(disable=None)
-
-        # pipeline creates a new motion UNet under the hood.
So we need to check the dtype from pipe.components - model_dtypes = [component.dtype for component in pipe.components.values() if hasattr(component, "dtype")] - self.assertTrue(all(dtype == torch.float32 for dtype in model_dtypes)) - - pipe.to(dtype=torch.float16) - model_dtypes = [component.dtype for component in pipe.components.values() if hasattr(component, "dtype")] - self.assertTrue(all(dtype == torch.float16 for dtype in model_dtypes)) - - def test_prompt_embeds(self): - components = self.get_dummy_components() - pipe = self.pipeline_class(**components) - pipe.set_progress_bar_config(disable=None) - pipe.to(torch_device) - - inputs = self.get_dummy_inputs(torch_device) - inputs.pop("prompt") - inputs["prompt_embeds"] = torch.randn((1, 4, pipe.text_encoder.config.hidden_size), device=torch_device) - pipe(**inputs) - - def test_free_init(self): - components = self.get_dummy_components() - pipe = self.pipeline_class(**components) - pipe.set_progress_bar_config(disable=None) - pipe.to(torch_device) - - inputs_normal = self.get_dummy_inputs(torch_device) - frames_normal = pipe(**inputs_normal).frames[0] - - pipe.enable_free_init( - num_iters=2, - use_fast_sampling=True, - method="butterworth", - order=4, - spatial_stop_frequency=0.25, - temporal_stop_frequency=0.25, - ) - inputs_enable_free_init = self.get_dummy_inputs(torch_device) - frames_enable_free_init = pipe(**inputs_enable_free_init).frames[0] - - pipe.disable_free_init() - inputs_disable_free_init = self.get_dummy_inputs(torch_device) - frames_disable_free_init = pipe(**inputs_disable_free_init).frames[0] - - sum_enabled = np.abs(to_np(frames_normal) - to_np(frames_enable_free_init)).sum() - max_diff_disabled = np.abs(to_np(frames_normal) - to_np(frames_disable_free_init)).max() - self.assertGreater( - sum_enabled, 1e1, "Enabling of FreeInit should lead to results different from the default pipeline results" - ) - self.assertLess( - max_diff_disabled, - 1e-4, - "Disabling of FreeInit should lead to results similar to the default pipeline results", - ) - - def test_free_init_with_schedulers(self): - components = self.get_dummy_components() - pipe: PIAPipeline = self.pipeline_class(**components) - pipe.set_progress_bar_config(disable=None) - pipe.to(torch_device) - - inputs_normal = self.get_dummy_inputs(torch_device) - frames_normal = pipe(**inputs_normal).frames[0] - - schedulers_to_test = [ - DPMSolverMultistepScheduler.from_config( - components["scheduler"].config, - timestep_spacing="linspace", - beta_schedule="linear", - algorithm_type="dpmsolver++", - steps_offset=1, - clip_sample=False, - ), - LCMScheduler.from_config( - components["scheduler"].config, - timestep_spacing="linspace", - beta_schedule="linear", - steps_offset=1, - clip_sample=False, - ), - ] - components.pop("scheduler") - - for scheduler in schedulers_to_test: - components["scheduler"] = scheduler - pipe: PIAPipeline = self.pipeline_class(**components) - pipe.set_progress_bar_config(disable=None) - pipe.to(torch_device) - - pipe.enable_free_init(num_iters=2, use_fast_sampling=False) - - inputs = self.get_dummy_inputs(torch_device) - frames_enable_free_init = pipe(**inputs).frames[0] - sum_enabled = np.abs(to_np(frames_normal) - to_np(frames_enable_free_init)).sum() - - self.assertGreater( - sum_enabled, - 1e1, - "Enabling of FreeInit should lead to results different from the default pipeline results", - ) - - @unittest.skipIf( - torch_device != "cuda" or not is_xformers_available(), - reason="XFormers attention is only available with CUDA and `xformers` 
installed", - ) - def test_xformers_attention_forwardGenerator_pass(self): - components = self.get_dummy_components() - pipe = self.pipeline_class(**components) - for component in pipe.components.values(): - if hasattr(component, "set_default_attn_processor"): - component.set_default_attn_processor() - pipe.to(torch_device) - pipe.set_progress_bar_config(disable=None) - - inputs = self.get_dummy_inputs(torch_device) - output_without_offload = pipe(**inputs).frames[0] - output_without_offload = ( - output_without_offload.cpu() if torch.is_tensor(output_without_offload) else output_without_offload - ) - - pipe.enable_xformers_memory_efficient_attention() - inputs = self.get_dummy_inputs(torch_device) - output_with_offload = pipe(**inputs).frames[0] - output_with_offload = ( - output_with_offload.cpu() if torch.is_tensor(output_with_offload) else output_without_offload - ) - - max_diff = np.abs(to_np(output_with_offload) - to_np(output_without_offload)).max() - self.assertLess(max_diff, 1e-4, "XFormers attention should not affect the inference results") - - def test_encode_prompt_works_in_isolation(self): - extra_required_param_value_dict = { - "device": torch.device(torch_device).type, - "num_images_per_prompt": 1, - "do_classifier_free_guidance": self.get_dummy_inputs(device=torch_device).get("guidance_scale", 1.0) > 1.0, - } - return super().test_encode_prompt_works_in_isolation(extra_required_param_value_dict) diff --git a/tests/pipelines/semantic_stable_diffusion/__init__.py b/tests/pipelines/semantic_stable_diffusion/__init__.py deleted file mode 100644 index e69de29bb2d1..000000000000 diff --git a/tests/pipelines/semantic_stable_diffusion/test_semantic_diffusion.py b/tests/pipelines/semantic_stable_diffusion/test_semantic_diffusion.py deleted file mode 100644 index b4d82b0fb2a8..000000000000 --- a/tests/pipelines/semantic_stable_diffusion/test_semantic_diffusion.py +++ /dev/null @@ -1,617 +0,0 @@ -# coding=utf-8 -# Copyright 2025 HuggingFace Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-
-import gc
-import random
-import tempfile
-import unittest
-
-import numpy as np
-import torch
-from transformers import CLIPTextConfig, CLIPTextModel, CLIPTokenizer
-
-from diffusers import AutoencoderKL, DDIMScheduler, LMSDiscreteScheduler, PNDMScheduler, UNet2DConditionModel
-from diffusers.pipelines.semantic_stable_diffusion import SemanticStableDiffusionPipeline as StableDiffusionPipeline
-from diffusers.utils.testing_utils import (
-    backend_empty_cache,
-    enable_full_determinism,
-    floats_tensor,
-    nightly,
-    require_torch_accelerator,
-    torch_device,
-)
-
-
-enable_full_determinism()
-
-
-class SemanticDiffusionPipelineFastTests(unittest.TestCase):
-    def setUp(self):
-        # clean up the VRAM before each test
-        super().setUp()
-        gc.collect()
-        backend_empty_cache(torch_device)
-
-    def tearDown(self):
-        # clean up the VRAM after each test
-        super().tearDown()
-        gc.collect()
-        backend_empty_cache(torch_device)
-
-    @property
-    def dummy_image(self):
-        batch_size = 1
-        num_channels = 3
-        sizes = (32, 32)
-
-        image = floats_tensor((batch_size, num_channels) + sizes, rng=random.Random(0)).to(torch_device)
-        return image
-
-    @property
-    def dummy_cond_unet(self):
-        torch.manual_seed(0)
-        model = UNet2DConditionModel(
-            block_out_channels=(32, 64),
-            layers_per_block=2,
-            sample_size=32,
-            in_channels=4,
-            out_channels=4,
-            down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"),
-            up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"),
-            cross_attention_dim=32,
-        )
-        return model
-
-    @property
-    def dummy_vae(self):
-        torch.manual_seed(0)
-        model = AutoencoderKL(
-            block_out_channels=[32, 64],
-            in_channels=3,
-            out_channels=3,
-            down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"],
-            up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"],
-            latent_channels=4,
-        )
-        return model
-
-    @property
-    def dummy_text_encoder(self):
-        torch.manual_seed(0)
-        config = CLIPTextConfig(
-            bos_token_id=0,
-            eos_token_id=2,
-            hidden_size=32,
-            intermediate_size=37,
-            layer_norm_eps=1e-05,
-            num_attention_heads=4,
-            num_hidden_layers=5,
-            pad_token_id=1,
-            vocab_size=1000,
-        )
-        return CLIPTextModel(config)
-
-    @property
-    def dummy_extractor(self):
-        def extract(*args, **kwargs):
-            class Out:
-                def __init__(self):
-                    self.pixel_values = torch.ones([0])
-
-                def to(self, device):
-                    self.pixel_values = self.pixel_values.to(device)
-                    return self
-
-            return Out()
-
-        return extract
-
-    def test_semantic_diffusion_ddim(self):
-        device = "cpu"  # ensure determinism for the device-dependent torch.Generator
-        unet = self.dummy_cond_unet
-        scheduler = DDIMScheduler(
-            beta_start=0.00085,
-            beta_end=0.012,
-            beta_schedule="scaled_linear",
-            clip_sample=False,
-            set_alpha_to_one=False,
-        )
-
-        vae = self.dummy_vae
-        bert = self.dummy_text_encoder
-        tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
-
-        # assemble the pipeline with the DDIM scheduler configured above
-        sd_pipe = StableDiffusionPipeline(
-            unet=unet,
-            scheduler=scheduler,
-            vae=vae,
-            text_encoder=bert,
-            tokenizer=tokenizer,
-            safety_checker=None,
-            feature_extractor=self.dummy_extractor,
-        )
-        sd_pipe = sd_pipe.to(device)
-        sd_pipe.set_progress_bar_config(disable=None)
-
-        prompt = "A painting of a squirrel eating a burger"
-
-        generator = torch.Generator(device=device).manual_seed(0)
-        output = sd_pipe([prompt], generator=generator, guidance_scale=6.0, num_inference_steps=2, output_type="np")
-        image = output.images
-
-        generator = torch.Generator(device=device).manual_seed(0)
-        image_from_tuple = sd_pipe(
-            [prompt],
-            generator=generator,
-
guidance_scale=6.0, - num_inference_steps=2, - output_type="np", - return_dict=False, - )[0] - - image_slice = image[0, -3:, -3:, -1] - image_from_tuple_slice = image_from_tuple[0, -3:, -3:, -1] - - assert image.shape == (1, 64, 64, 3) - expected_slice = np.array([0.5753, 0.6114, 0.5001, 0.5034, 0.5470, 0.4729, 0.4971, 0.4867, 0.4867]) - - assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2 - assert np.abs(image_from_tuple_slice.flatten() - expected_slice).max() < 1e-2 - - def test_semantic_diffusion_pndm(self): - device = "cpu" # ensure determinism for the device-dependent torch.Generator - unet = self.dummy_cond_unet - scheduler = PNDMScheduler(skip_prk_steps=True) - vae = self.dummy_vae - bert = self.dummy_text_encoder - tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip") - - # make sure here that pndm scheduler skips prk - sd_pipe = StableDiffusionPipeline( - unet=unet, - scheduler=scheduler, - vae=vae, - text_encoder=bert, - tokenizer=tokenizer, - safety_checker=None, - feature_extractor=self.dummy_extractor, - ) - sd_pipe = sd_pipe.to(device) - sd_pipe.set_progress_bar_config(disable=None) - - prompt = "A painting of a squirrel eating a burger" - generator = torch.Generator(device=device).manual_seed(0) - output = sd_pipe([prompt], generator=generator, guidance_scale=6.0, num_inference_steps=2, output_type="np") - - image = output.images - - generator = torch.Generator(device=device).manual_seed(0) - image_from_tuple = sd_pipe( - [prompt], - generator=generator, - guidance_scale=6.0, - num_inference_steps=2, - output_type="np", - return_dict=False, - )[0] - - image_slice = image[0, -3:, -3:, -1] - image_from_tuple_slice = image_from_tuple[0, -3:, -3:, -1] - - assert image.shape == (1, 64, 64, 3) - expected_slice = np.array([0.5122, 0.5712, 0.4825, 0.5053, 0.5646, 0.4769, 0.5179, 0.4894, 0.4994]) - - assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2 - assert np.abs(image_from_tuple_slice.flatten() - expected_slice).max() < 1e-2 - - def test_semantic_diffusion_no_safety_checker(self): - pipe = StableDiffusionPipeline.from_pretrained( - "hf-internal-testing/tiny-stable-diffusion-lms-pipe", safety_checker=None - ) - assert isinstance(pipe, StableDiffusionPipeline) - assert isinstance(pipe.scheduler, LMSDiscreteScheduler) - assert pipe.safety_checker is None - - image = pipe("example prompt", num_inference_steps=2).images[0] - assert image is not None - - # check that there's no error when saving a pipeline with one of the models being None - with tempfile.TemporaryDirectory() as tmpdirname: - pipe.save_pretrained(tmpdirname) - pipe = StableDiffusionPipeline.from_pretrained(tmpdirname) - - # sanity check that the pipeline still works - assert pipe.safety_checker is None - image = pipe("example prompt", num_inference_steps=2).images[0] - assert image is not None - - @require_torch_accelerator - def test_semantic_diffusion_fp16(self): - """Test that stable diffusion works with fp16""" - unet = self.dummy_cond_unet - scheduler = PNDMScheduler(skip_prk_steps=True) - vae = self.dummy_vae - bert = self.dummy_text_encoder - tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip") - - # put models in fp16 - unet = unet.half() - vae = vae.half() - bert = bert.half() - - # make sure here that pndm scheduler skips prk - sd_pipe = StableDiffusionPipeline( - unet=unet, - scheduler=scheduler, - vae=vae, - text_encoder=bert, - tokenizer=tokenizer, - safety_checker=None, - feature_extractor=self.dummy_extractor, - ) - 
sd_pipe = sd_pipe.to(torch_device) - sd_pipe.set_progress_bar_config(disable=None) - - prompt = "A painting of a squirrel eating a burger" - image = sd_pipe([prompt], num_inference_steps=2, output_type="np").images - - assert image.shape == (1, 64, 64, 3) - - -@nightly -@require_torch_accelerator -class SemanticDiffusionPipelineIntegrationTests(unittest.TestCase): - def setUp(self): - # clean up the VRAM before each test - super().setUp() - gc.collect() - backend_empty_cache(torch_device) - - def tearDown(self): - # clean up the VRAM after each test - super().tearDown() - gc.collect() - backend_empty_cache(torch_device) - - def test_positive_guidance(self): - pipe = StableDiffusionPipeline.from_pretrained("stable-diffusion-v1-5/stable-diffusion-v1-5") - pipe = pipe.to(torch_device) - pipe.set_progress_bar_config(disable=None) - - prompt = "a photo of a cat" - edit = { - "editing_prompt": ["sunglasses"], - "reverse_editing_direction": [False], - "edit_warmup_steps": 10, - "edit_guidance_scale": 6, - "edit_threshold": 0.95, - "edit_momentum_scale": 0.5, - "edit_mom_beta": 0.6, - } - - seed = 3 - guidance_scale = 7 - - # no sega enabled - generator = torch.Generator(torch_device) - generator.manual_seed(seed) - output = pipe( - [prompt], - generator=generator, - guidance_scale=guidance_scale, - num_inference_steps=50, - output_type="np", - width=512, - height=512, - ) - - image = output.images - image_slice = image[0, -3:, -3:, -1] - expected_slice = [ - 0.34673113, - 0.38492733, - 0.37597352, - 0.34086335, - 0.35650748, - 0.35579205, - 0.3384763, - 0.34340236, - 0.3573271, - ] - - assert image.shape == (1, 512, 512, 3) - - assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2 - - # with sega enabled - # generator = torch.manual_seed(seed) - generator.manual_seed(seed) - output = pipe( - [prompt], - generator=generator, - guidance_scale=guidance_scale, - num_inference_steps=50, - output_type="np", - width=512, - height=512, - **edit, - ) - - image = output.images - image_slice = image[0, -3:, -3:, -1] - expected_slice = [ - 0.41887826, - 0.37728766, - 0.30138272, - 0.41416335, - 0.41664985, - 0.36283392, - 0.36191246, - 0.43364465, - 0.43001732, - ] - - assert image.shape == (1, 512, 512, 3) - - assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2 - - def test_negative_guidance(self): - pipe = StableDiffusionPipeline.from_pretrained("stable-diffusion-v1-5/stable-diffusion-v1-5") - pipe = pipe.to(torch_device) - pipe.set_progress_bar_config(disable=None) - - prompt = "an image of a crowded boulevard, realistic, 4k" - edit = { - "editing_prompt": "crowd, crowded, people", - "reverse_editing_direction": True, - "edit_warmup_steps": 10, - "edit_guidance_scale": 8.3, - "edit_threshold": 0.9, - "edit_momentum_scale": 0.5, - "edit_mom_beta": 0.6, - } - - seed = 9 - guidance_scale = 7 - - # no sega enabled - generator = torch.Generator(torch_device) - generator.manual_seed(seed) - output = pipe( - [prompt], - generator=generator, - guidance_scale=guidance_scale, - num_inference_steps=50, - output_type="np", - width=512, - height=512, - ) - - image = output.images - image_slice = image[0, -3:, -3:, -1] - expected_slice = [ - 0.43497998, - 0.91814065, - 0.7540739, - 0.55580205, - 0.8467265, - 0.5389691, - 0.62574506, - 0.58897763, - 0.50926757, - ] - - assert image.shape == (1, 512, 512, 3) - - assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2 - - # with sega enabled - # generator = torch.manual_seed(seed) - generator.manual_seed(seed) - output = pipe( - 
[prompt], - generator=generator, - guidance_scale=guidance_scale, - num_inference_steps=50, - output_type="np", - width=512, - height=512, - **edit, - ) - - image = output.images - image_slice = image[0, -3:, -3:, -1] - expected_slice = [ - 0.3089719, - 0.30500144, - 0.29016042, - 0.30630964, - 0.325687, - 0.29419225, - 0.2908091, - 0.28723598, - 0.27696294, - ] - - assert image.shape == (1, 512, 512, 3) - - assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2 - - def test_multi_cond_guidance(self): - pipe = StableDiffusionPipeline.from_pretrained("stable-diffusion-v1-5/stable-diffusion-v1-5") - pipe = pipe.to(torch_device) - pipe.set_progress_bar_config(disable=None) - - prompt = "a castle next to a river" - edit = { - "editing_prompt": ["boat on a river, boat", "monet, impression, sunrise"], - "reverse_editing_direction": False, - "edit_warmup_steps": [15, 18], - "edit_guidance_scale": 6, - "edit_threshold": [0.9, 0.8], - "edit_momentum_scale": 0.5, - "edit_mom_beta": 0.6, - } - - seed = 48 - guidance_scale = 7 - - # no sega enabled - generator = torch.Generator(torch_device) - generator.manual_seed(seed) - output = pipe( - [prompt], - generator=generator, - guidance_scale=guidance_scale, - num_inference_steps=50, - output_type="np", - width=512, - height=512, - ) - - image = output.images - image_slice = image[0, -3:, -3:, -1] - expected_slice = [ - 0.75163555, - 0.76037145, - 0.61785, - 0.9189673, - 0.8627701, - 0.85189694, - 0.8512813, - 0.87012076, - 0.8312857, - ] - - assert image.shape == (1, 512, 512, 3) - - assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2 - - # with sega enabled - # generator = torch.manual_seed(seed) - generator.manual_seed(seed) - output = pipe( - [prompt], - generator=generator, - guidance_scale=guidance_scale, - num_inference_steps=50, - output_type="np", - width=512, - height=512, - **edit, - ) - - image = output.images - image_slice = image[0, -3:, -3:, -1] - expected_slice = [ - 0.73553365, - 0.7537271, - 0.74341905, - 0.66480356, - 0.6472925, - 0.63039416, - 0.64812905, - 0.6749717, - 0.6517102, - ] - - assert image.shape == (1, 512, 512, 3) - - assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2 - - def test_guidance_fp16(self): - pipe = StableDiffusionPipeline.from_pretrained( - "stable-diffusion-v1-5/stable-diffusion-v1-5", torch_dtype=torch.float16 - ) - pipe = pipe.to(torch_device) - pipe.set_progress_bar_config(disable=None) - - prompt = "a photo of a cat" - edit = { - "editing_prompt": ["sunglasses"], - "reverse_editing_direction": [False], - "edit_warmup_steps": 10, - "edit_guidance_scale": 6, - "edit_threshold": 0.95, - "edit_momentum_scale": 0.5, - "edit_mom_beta": 0.6, - } - - seed = 3 - guidance_scale = 7 - - # no sega enabled - generator = torch.Generator(torch_device) - generator.manual_seed(seed) - output = pipe( - [prompt], - generator=generator, - guidance_scale=guidance_scale, - num_inference_steps=50, - output_type="np", - width=512, - height=512, - ) - - image = output.images - image_slice = image[0, -3:, -3:, -1] - expected_slice = [ - 0.34887695, - 0.3876953, - 0.375, - 0.34423828, - 0.3581543, - 0.35717773, - 0.3383789, - 0.34570312, - 0.359375, - ] - - assert image.shape == (1, 512, 512, 3) - - assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2 - - # with sega enabled - # generator = torch.manual_seed(seed) - generator.manual_seed(seed) - output = pipe( - [prompt], - generator=generator, - guidance_scale=guidance_scale, - num_inference_steps=50, - output_type="np", - 
width=512, - height=512, - **edit, - ) - - image = output.images - image_slice = image[0, -3:, -3:, -1] - expected_slice = [ - 0.42285156, - 0.36914062, - 0.29077148, - 0.42041016, - 0.41918945, - 0.35498047, - 0.3618164, - 0.4423828, - 0.43115234, - ] - - assert image.shape == (1, 512, 512, 3) - - assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2 diff --git a/tests/pipelines/stable_diffusion_2/test_stable_diffusion_attend_and_excite.py b/tests/pipelines/stable_diffusion_2/test_stable_diffusion_attend_and_excite.py deleted file mode 100644 index 45fc70be2300..000000000000 --- a/tests/pipelines/stable_diffusion_2/test_stable_diffusion_attend_and_excite.py +++ /dev/null @@ -1,267 +0,0 @@ -# coding=utf-8 -# Copyright 2025 HuggingFace Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import gc -import unittest - -import numpy as np -import torch -from transformers import CLIPTextConfig, CLIPTextModel, CLIPTokenizer - -from diffusers import ( - AutoencoderKL, - DDIMScheduler, - StableDiffusionAttendAndExcitePipeline, - UNet2DConditionModel, -) -from diffusers.utils.testing_utils import ( - backend_empty_cache, - load_numpy, - nightly, - numpy_cosine_similarity_distance, - require_torch_accelerator, - skip_mps, - torch_device, -) - -from ..pipeline_params import TEXT_TO_IMAGE_BATCH_PARAMS, TEXT_TO_IMAGE_IMAGE_PARAMS, TEXT_TO_IMAGE_PARAMS -from ..test_pipelines_common import ( - PipelineFromPipeTesterMixin, - PipelineKarrasSchedulerTesterMixin, - PipelineLatentTesterMixin, - PipelineTesterMixin, -) - - -torch.backends.cuda.matmul.allow_tf32 = False - - -@skip_mps -class StableDiffusionAttendAndExcitePipelineFastTests( - PipelineLatentTesterMixin, - PipelineKarrasSchedulerTesterMixin, - PipelineTesterMixin, - PipelineFromPipeTesterMixin, - unittest.TestCase, -): - pipeline_class = StableDiffusionAttendAndExcitePipeline - test_attention_slicing = False - params = TEXT_TO_IMAGE_PARAMS - batch_params = TEXT_TO_IMAGE_BATCH_PARAMS.union({"token_indices"}) - image_params = TEXT_TO_IMAGE_IMAGE_PARAMS - image_latents_params = TEXT_TO_IMAGE_IMAGE_PARAMS - - # Attend and excite requires being able to run a backward pass at - # inference time. 
There's no deterministic backward operator for pad - - @classmethod - def setUpClass(cls): - super().setUpClass() - torch.use_deterministic_algorithms(False) - - @classmethod - def tearDownClass(cls): - super().tearDownClass() - torch.use_deterministic_algorithms(True) - - def get_dummy_components(self): - torch.manual_seed(0) - unet = UNet2DConditionModel( - block_out_channels=(32, 64), - layers_per_block=1, - sample_size=32, - in_channels=4, - out_channels=4, - down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"), - up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"), - cross_attention_dim=32, - # SD2-specific config below - attention_head_dim=(2, 4), - use_linear_projection=True, - ) - scheduler = DDIMScheduler( - beta_start=0.00085, - beta_end=0.012, - beta_schedule="scaled_linear", - clip_sample=False, - set_alpha_to_one=False, - ) - torch.manual_seed(0) - vae = AutoencoderKL( - block_out_channels=[32, 64], - in_channels=3, - out_channels=3, - down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"], - up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"], - latent_channels=4, - sample_size=128, - ) - torch.manual_seed(0) - text_encoder_config = CLIPTextConfig( - bos_token_id=0, - eos_token_id=2, - hidden_size=32, - intermediate_size=37, - layer_norm_eps=1e-05, - num_attention_heads=4, - num_hidden_layers=5, - pad_token_id=1, - vocab_size=1000, - # SD2-specific config below - hidden_act="gelu", - projection_dim=512, - ) - text_encoder = CLIPTextModel(text_encoder_config) - tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip") - - components = { - "unet": unet, - "scheduler": scheduler, - "vae": vae, - "text_encoder": text_encoder, - "tokenizer": tokenizer, - "safety_checker": None, - "feature_extractor": None, - } - - return components - - def get_dummy_inputs(self, device, seed=0): - if str(device).startswith("mps"): - generator = torch.manual_seed(seed) - else: - generator = torch.Generator(device=device).manual_seed(seed) - inputs = { - "prompt": "a cat and a frog", - "token_indices": [2, 5], - "generator": generator, - "num_inference_steps": 1, - "guidance_scale": 6.0, - "output_type": "np", - "max_iter_to_alter": 2, - "thresholds": {0: 0.7}, - } - return inputs - - def test_dict_tuple_outputs_equivalent(self): - expected_slice = None - if torch_device == "cpu": - expected_slice = np.array([0.6391, 0.6290, 0.4860, 0.5134, 0.5550, 0.4577, 0.5033, 0.5023, 0.4538]) - super().test_dict_tuple_outputs_equivalent(expected_slice=expected_slice, expected_max_difference=3e-3) - - def test_inference(self): - device = "cpu" - - components = self.get_dummy_components() - pipe = self.pipeline_class(**components) - pipe.to(device) - pipe.set_progress_bar_config(disable=None) - - inputs = self.get_dummy_inputs(device) - image = pipe(**inputs).images - image_slice = image[0, -3:, -3:, -1] - - self.assertEqual(image.shape, (1, 64, 64, 3)) - expected_slice = np.array( - [0.63905364, 0.62897307, 0.48599017, 0.5133624, 0.5550048, 0.45769516, 0.50326973, 0.5023139, 0.45384496] - ) - max_diff = np.abs(image_slice.flatten() - expected_slice).max() - self.assertLessEqual(max_diff, 1e-3) - - def test_sequential_cpu_offload_forward_pass(self): - super().test_sequential_cpu_offload_forward_pass(expected_max_diff=5e-4) - - def test_inference_batch_consistent(self): - # NOTE: Larger batch sizes cause this test to timeout, only test on smaller batches - self._test_inference_batch_consistent(batch_sizes=[1, 2]) - - def test_inference_batch_single_identical(self): - 
self._test_inference_batch_single_identical(batch_size=2, expected_max_diff=7e-4) - - def test_pt_np_pil_outputs_equivalent(self): - super().test_pt_np_pil_outputs_equivalent(expected_max_diff=5e-4) - - def test_save_load_local(self): - super().test_save_load_local(expected_max_difference=5e-4) - - def test_save_load_optional_components(self): - super().test_save_load_optional_components(expected_max_difference=4e-4) - - def test_karras_schedulers_shape(self): - super().test_karras_schedulers_shape(num_inference_steps_for_strength_for_iterations=3) - - def test_from_pipe_consistent_forward_pass_cpu_offload(self): - super().test_from_pipe_consistent_forward_pass_cpu_offload(expected_max_diff=5e-3) - - def test_encode_prompt_works_in_isolation(self): - extra_required_param_value_dict = { - "device": torch.device(torch_device).type, - "do_classifier_free_guidance": self.get_dummy_inputs(device=torch_device).get("guidance_scale", 1.0) > 1.0, - } - return super().test_encode_prompt_works_in_isolation(extra_required_param_value_dict) - - -@require_torch_accelerator -@nightly -class StableDiffusionAttendAndExcitePipelineIntegrationTests(unittest.TestCase): - # Attend and excite requires being able to run a backward pass at - # inference time. There's no deterministic backward operator for pad - - @classmethod - def setUpClass(cls): - super().setUpClass() - torch.use_deterministic_algorithms(False) - - @classmethod - def tearDownClass(cls): - super().tearDownClass() - torch.use_deterministic_algorithms(True) - - def setUp(self): - super().setUp() - gc.collect() - backend_empty_cache(torch_device) - - def tearDown(self): - super().tearDown() - gc.collect() - backend_empty_cache(torch_device) - - def test_attend_and_excite_fp16(self): - generator = torch.manual_seed(51) - - pipe = StableDiffusionAttendAndExcitePipeline.from_pretrained( - "CompVis/stable-diffusion-v1-4", safety_checker=None, torch_dtype=torch.float16 - ) - pipe.to(torch_device) - - prompt = "a painting of an elephant with glasses" - token_indices = [5, 7] - - image = pipe( - prompt=prompt, - token_indices=token_indices, - guidance_scale=7.5, - generator=generator, - num_inference_steps=5, - max_iter_to_alter=5, - output_type="np", - ).images[0] - - expected_image = load_numpy( - "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/attend-and-excite/elephant_glasses.npy" - ) - max_diff = numpy_cosine_similarity_distance(image.flatten(), expected_image.flatten()) - assert max_diff < 5e-1 diff --git a/tests/pipelines/stable_diffusion_2/test_stable_diffusion_diffedit.py b/tests/pipelines/stable_diffusion_2/test_stable_diffusion_diffedit.py deleted file mode 100644 index 9f8870af7b16..000000000000 --- a/tests/pipelines/stable_diffusion_2/test_stable_diffusion_diffedit.py +++ /dev/null @@ -1,452 +0,0 @@ -# coding=utf-8 -# Copyright 2025 HuggingFace Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
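A pattern worth noting from the Attend-and-Excite classes deleted above: because the pipeline runs a backward pass at inference time and ops such as pad have no deterministic backward on CUDA, determinism is switched off for the whole test class and restored afterwards. A condensed sketch of that setUpClass/tearDownClass dance is below; the test body is a hypothetical stand-in, not taken from the deleted files.

import unittest

import torch
import torch.nn.functional as F


class NonDeterministicBackwardTests(unittest.TestCase):
    @classmethod
    def setUpClass(cls):
        super().setUpClass()
        # Allow nondeterministic kernels for this class only.
        torch.use_deterministic_algorithms(False)

    @classmethod
    def tearDownClass(cls):
        super().tearDownClass()
        # Restore deterministic mode for the rest of the suite.
        torch.use_deterministic_algorithms(True)

    def test_backward_through_pad(self):
        x = torch.randn(1, 3, 8, 8, requires_grad=True)
        # Replication padding is one of the ops whose CUDA backward is nondeterministic.
        y = F.pad(x, (1, 1, 1, 1), mode="replicate")
        y.sum().backward()
        self.assertIsNotNone(x.grad)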
- -import gc -import random -import tempfile -import unittest - -import numpy as np -import torch -from PIL import Image -from transformers import CLIPTextConfig, CLIPTextModel, CLIPTokenizer - -from diffusers import ( - AutoencoderKL, - DDIMInverseScheduler, - DDIMScheduler, - DPMSolverMultistepInverseScheduler, - DPMSolverMultistepScheduler, - StableDiffusionDiffEditPipeline, - UNet2DConditionModel, -) -from diffusers.utils.testing_utils import ( - backend_empty_cache, - enable_full_determinism, - floats_tensor, - load_image, - nightly, - numpy_cosine_similarity_distance, - require_torch_accelerator, - torch_device, -) - -from ..pipeline_params import TEXT_GUIDED_IMAGE_INPAINTING_BATCH_PARAMS, TEXT_GUIDED_IMAGE_INPAINTING_PARAMS -from ..test_pipelines_common import PipelineFromPipeTesterMixin, PipelineLatentTesterMixin, PipelineTesterMixin - - -enable_full_determinism() - - -class StableDiffusionDiffEditPipelineFastTests( - PipelineLatentTesterMixin, PipelineTesterMixin, PipelineFromPipeTesterMixin, unittest.TestCase -): - pipeline_class = StableDiffusionDiffEditPipeline - params = TEXT_GUIDED_IMAGE_INPAINTING_PARAMS - {"height", "width", "image"} | {"image_latents"} - batch_params = TEXT_GUIDED_IMAGE_INPAINTING_BATCH_PARAMS - {"image"} | {"image_latents"} - image_params = frozenset( - [] - ) # TO-DO: update image_params once pipeline is refactored with VaeImageProcessor.preprocess - image_latents_params = frozenset([]) - - def get_dummy_components(self): - torch.manual_seed(0) - unet = UNet2DConditionModel( - block_out_channels=(32, 64), - layers_per_block=2, - sample_size=32, - in_channels=4, - out_channels=4, - down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"), - up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"), - cross_attention_dim=32, - # SD2-specific config below - attention_head_dim=(2, 4), - use_linear_projection=True, - ) - scheduler = DDIMScheduler( - beta_start=0.00085, - beta_end=0.012, - beta_schedule="scaled_linear", - clip_sample=False, - set_alpha_to_one=False, - ) - inverse_scheduler = DDIMInverseScheduler( - beta_start=0.00085, - beta_end=0.012, - beta_schedule="scaled_linear", - clip_sample=False, - set_alpha_to_zero=False, - ) - torch.manual_seed(0) - vae = AutoencoderKL( - block_out_channels=[32, 64], - in_channels=3, - out_channels=3, - down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"], - up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"], - latent_channels=4, - sample_size=128, - ) - torch.manual_seed(0) - text_encoder_config = CLIPTextConfig( - bos_token_id=0, - eos_token_id=2, - hidden_size=32, - intermediate_size=37, - layer_norm_eps=1e-05, - num_attention_heads=4, - num_hidden_layers=5, - pad_token_id=1, - vocab_size=1000, - # SD2-specific config below - hidden_act="gelu", - projection_dim=512, - ) - text_encoder = CLIPTextModel(text_encoder_config) - tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip") - - components = { - "unet": unet, - "scheduler": scheduler, - "inverse_scheduler": inverse_scheduler, - "vae": vae, - "text_encoder": text_encoder, - "tokenizer": tokenizer, - "safety_checker": None, - "feature_extractor": None, - } - - return components - - def get_dummy_inputs(self, device, seed=0): - mask = floats_tensor((1, 16, 16), rng=random.Random(seed)).to(device) - latents = floats_tensor((1, 2, 4, 16, 16), rng=random.Random(seed)).to(device) - if str(device).startswith("mps"): - generator = torch.manual_seed(seed) - else: - generator = torch.Generator(device=device).manual_seed(seed) - inputs = { 
- "prompt": "a dog and a newt", - "mask_image": mask, - "image_latents": latents, - "generator": generator, - "num_inference_steps": 2, - "inpaint_strength": 1.0, - "guidance_scale": 6.0, - "output_type": "np", - } - - return inputs - - def get_dummy_mask_inputs(self, device, seed=0): - image = floats_tensor((1, 3, 32, 32), rng=random.Random(seed)).to(device) - image = image.cpu().permute(0, 2, 3, 1)[0] - image = Image.fromarray(np.uint8(image)).convert("RGB") - if str(device).startswith("mps"): - generator = torch.manual_seed(seed) - else: - generator = torch.Generator(device=device).manual_seed(seed) - inputs = { - "image": image, - "source_prompt": "a cat and a frog", - "target_prompt": "a dog and a newt", - "generator": generator, - "num_inference_steps": 2, - "num_maps_per_mask": 2, - "mask_encode_strength": 1.0, - "guidance_scale": 6.0, - "output_type": "np", - } - - return inputs - - def get_dummy_inversion_inputs(self, device, seed=0): - image = floats_tensor((1, 3, 32, 32), rng=random.Random(seed)).to(device) - image = image.cpu().permute(0, 2, 3, 1)[0] - image = Image.fromarray(np.uint8(image)).convert("RGB") - if str(device).startswith("mps"): - generator = torch.manual_seed(seed) - else: - generator = torch.Generator(device=device).manual_seed(seed) - inputs = { - "image": image, - "prompt": "a cat and a frog", - "generator": generator, - "num_inference_steps": 2, - "inpaint_strength": 1.0, - "guidance_scale": 6.0, - "decode_latents": True, - "output_type": "np", - } - return inputs - - def test_save_load_optional_components(self): - if not hasattr(self.pipeline_class, "_optional_components"): - return - - components = self.get_dummy_components() - pipe = self.pipeline_class(**components) - pipe.to(torch_device) - pipe.set_progress_bar_config(disable=None) - - # set all optional components to None and update pipeline config accordingly - for optional_component in pipe._optional_components: - setattr(pipe, optional_component, None) - pipe.register_modules(**dict.fromkeys(pipe._optional_components)) - - inputs = self.get_dummy_inputs(torch_device) - output = pipe(**inputs)[0] - - with tempfile.TemporaryDirectory() as tmpdir: - pipe.save_pretrained(tmpdir) - pipe_loaded = self.pipeline_class.from_pretrained(tmpdir) - pipe_loaded.to(torch_device) - pipe_loaded.set_progress_bar_config(disable=None) - - for optional_component in pipe._optional_components: - self.assertTrue( - getattr(pipe_loaded, optional_component) is None, - f"`{optional_component}` did not stay set to None after loading.", - ) - - inputs = self.get_dummy_inputs(torch_device) - output_loaded = pipe_loaded(**inputs)[0] - - max_diff = np.abs(output - output_loaded).max() - self.assertLess(max_diff, 1e-4) - - def test_mask(self): - device = "cpu" - - components = self.get_dummy_components() - pipe = self.pipeline_class(**components) - pipe.to(device) - pipe.set_progress_bar_config(disable=None) - - inputs = self.get_dummy_mask_inputs(device) - mask = pipe.generate_mask(**inputs) - mask_slice = mask[0, -3:, -3:] - - self.assertEqual(mask.shape, (1, 16, 16)) - expected_slice = np.array([0] * 9) - max_diff = np.abs(mask_slice.flatten() - expected_slice).max() - self.assertLessEqual(max_diff, 1e-3) - self.assertEqual(mask[0, -3, -4], 0) - - def test_inversion(self): - device = "cpu" - - components = self.get_dummy_components() - pipe = self.pipeline_class(**components) - pipe.to(device) - pipe.set_progress_bar_config(disable=None) - - inputs = self.get_dummy_inversion_inputs(device) - image = pipe.invert(**inputs).images 
- image_slice = image[0, -1, -3:, -3:] - - self.assertEqual(image.shape, (2, 32, 32, 3)) - expected_slice = np.array( - [0.5160, 0.5115, 0.5060, 0.5456, 0.4704, 0.5060, 0.5019, 0.4405, 0.4726], - ) - max_diff = np.abs(image_slice.flatten() - expected_slice).max() - self.assertLessEqual(max_diff, 1e-3) - - def test_inference_batch_single_identical(self): - super().test_inference_batch_single_identical(expected_max_diff=5e-3) - - def test_inversion_dpm(self): - device = "cpu" - - components = self.get_dummy_components() - - scheduler_args = {"beta_start": 0.00085, "beta_end": 0.012, "beta_schedule": "scaled_linear"} - components["scheduler"] = DPMSolverMultistepScheduler(**scheduler_args) - components["inverse_scheduler"] = DPMSolverMultistepInverseScheduler(**scheduler_args) - - pipe = self.pipeline_class(**components) - pipe.to(device) - pipe.set_progress_bar_config(disable=None) - - inputs = self.get_dummy_inversion_inputs(device) - image = pipe.invert(**inputs).images - image_slice = image[0, -1, -3:, -3:] - - self.assertEqual(image.shape, (2, 32, 32, 3)) - expected_slice = np.array( - [0.5305, 0.4673, 0.5314, 0.5308, 0.4886, 0.5279, 0.5142, 0.4724, 0.4892], - ) - max_diff = np.abs(image_slice.flatten() - expected_slice).max() - self.assertLessEqual(max_diff, 1e-3) - - def test_encode_prompt_works_in_isolation(self): - extra_required_param_value_dict = { - "device": torch.device(torch_device).type, - "do_classifier_free_guidance": self.get_dummy_inputs(device=torch_device).get("guidance_scale", 1.0) > 1.0, - } - return super().test_encode_prompt_works_in_isolation(extra_required_param_value_dict) - - -@require_torch_accelerator -@nightly -class StableDiffusionDiffEditPipelineIntegrationTests(unittest.TestCase): - def setUp(self): - super().setUp() - gc.collect() - backend_empty_cache(torch_device) - - def tearDown(self): - super().tearDown() - gc.collect() - backend_empty_cache(torch_device) - - @classmethod - def setUpClass(cls): - raw_image = load_image( - "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/diffedit/fruit.png" - ) - raw_image = raw_image.convert("RGB").resize((256, 256)) - - cls.raw_image = raw_image - - def test_stable_diffusion_diffedit_full(self): - generator = torch.manual_seed(0) - - pipe = StableDiffusionDiffEditPipeline.from_pretrained( - "stabilityai/stable-diffusion-2-1-base", safety_checker=None, torch_dtype=torch.float16 - ) - pipe.scheduler = DDIMScheduler.from_config(pipe.scheduler.config) - pipe.scheduler.clip_sample = True - - pipe.inverse_scheduler = DDIMInverseScheduler.from_config(pipe.scheduler.config) - pipe.enable_model_cpu_offload(device=torch_device) - pipe.set_progress_bar_config(disable=None) - - source_prompt = "a bowl of fruit" - target_prompt = "a bowl of pears" - - mask_image = pipe.generate_mask( - image=self.raw_image, - source_prompt=source_prompt, - target_prompt=target_prompt, - generator=generator, - ) - - inv_latents = pipe.invert( - prompt=source_prompt, - image=self.raw_image, - inpaint_strength=0.7, - generator=generator, - num_inference_steps=5, - ).latents - - image = pipe( - prompt=target_prompt, - mask_image=mask_image, - image_latents=inv_latents, - generator=generator, - negative_prompt=source_prompt, - inpaint_strength=0.7, - num_inference_steps=5, - output_type="np", - ).images[0] - - expected_image = ( - np.array( - load_image( - "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main" - "/diffedit/pears.png" - ).resize((256, 256)) - ) - / 255 - ) - - assert 
numpy_cosine_similarity_distance(expected_image.flatten(), image.flatten()) < 2e-1 - - -@nightly -@require_torch_accelerator -class StableDiffusionDiffEditPipelineNightlyTests(unittest.TestCase): - def setUp(self): - super().setUp() - gc.collect() - backend_empty_cache(torch_device) - - def tearDown(self): - super().tearDown() - gc.collect() - backend_empty_cache(torch_device) - - @classmethod - def setUpClass(cls): - raw_image = load_image( - "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/diffedit/fruit.png" - ) - - raw_image = raw_image.convert("RGB").resize((768, 768)) - - cls.raw_image = raw_image - - def test_stable_diffusion_diffedit_dpm(self): - generator = torch.manual_seed(0) - - pipe = StableDiffusionDiffEditPipeline.from_pretrained( - "stabilityai/stable-diffusion-2-1", safety_checker=None, torch_dtype=torch.float16 - ) - pipe.scheduler = DPMSolverMultistepScheduler.from_config(pipe.scheduler.config) - pipe.inverse_scheduler = DPMSolverMultistepInverseScheduler.from_config(pipe.scheduler.config) - pipe.enable_model_cpu_offload() - pipe.set_progress_bar_config(disable=None) - - source_prompt = "a bowl of fruit" - target_prompt = "a bowl of pears" - - mask_image = pipe.generate_mask( - image=self.raw_image, - source_prompt=source_prompt, - target_prompt=target_prompt, - generator=generator, - ) - - inv_latents = pipe.invert( - prompt=source_prompt, - image=self.raw_image, - inpaint_strength=0.7, - generator=generator, - num_inference_steps=25, - ).latents - - image = pipe( - prompt=target_prompt, - mask_image=mask_image, - image_latents=inv_latents, - generator=generator, - negative_prompt=source_prompt, - inpaint_strength=0.7, - num_inference_steps=25, - output_type="np", - ).images[0] - - expected_image = ( - np.array( - load_image( - "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main" - "/diffedit/pears.png" - ).resize((768, 768)) - ) - / 255 - ) - assert np.abs((expected_image - image).max()) < 5e-1 diff --git a/tests/pipelines/stable_diffusion_gligen/__init__.py b/tests/pipelines/stable_diffusion_gligen/__init__.py deleted file mode 100644 index e69de29bb2d1..000000000000 diff --git a/tests/pipelines/stable_diffusion_gligen/test_stable_diffusion_gligen.py b/tests/pipelines/stable_diffusion_gligen/test_stable_diffusion_gligen.py deleted file mode 100644 index 5d56f1680318..000000000000 --- a/tests/pipelines/stable_diffusion_gligen/test_stable_diffusion_gligen.py +++ /dev/null @@ -1,175 +0,0 @@ -# coding=utf-8 -# Copyright 2025 HuggingFace Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
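For orientation, the DiffEdit integration tests deleted above all drive the same three-stage flow. The sketch below condenses it into one script; the checkpoint, image URL, and hyperparameters are taken from those tests, which remain the authoritative reference.

import torch
from diffusers import DDIMInverseScheduler, DDIMScheduler, StableDiffusionDiffEditPipeline
from diffusers.utils import load_image

pipe = StableDiffusionDiffEditPipeline.from_pretrained(
    "stabilityai/stable-diffusion-2-1-base", safety_checker=None, torch_dtype=torch.float16
)
pipe.scheduler = DDIMScheduler.from_config(pipe.scheduler.config)
pipe.inverse_scheduler = DDIMInverseScheduler.from_config(pipe.scheduler.config)
pipe.enable_model_cpu_offload()

image = load_image(
    "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/diffedit/fruit.png"
).convert("RGB").resize((256, 256))
generator = torch.manual_seed(0)

# 1) Contrast the source and target prompts to localize the editable region.
mask_image = pipe.generate_mask(
    image=image, source_prompt="a bowl of fruit", target_prompt="a bowl of pears", generator=generator
)
# 2) Invert the image into latents along the source prompt's trajectory.
inv_latents = pipe.invert(
    prompt="a bowl of fruit", image=image, inpaint_strength=0.7, generator=generator, num_inference_steps=25
).latents
# 3) Denoise toward the target prompt, editing only inside the mask.
edited = pipe(
    prompt="a bowl of pears",
    mask_image=mask_image,
    image_latents=inv_latents,
    negative_prompt="a bowl of fruit",
    inpaint_strength=0.7,
    num_inference_steps=25,
    output_type="np",
).images[0]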
- -import unittest - -import numpy as np -import torch -from transformers import CLIPTextConfig, CLIPTextModel, CLIPTokenizer - -from diffusers import ( - AutoencoderKL, - DDIMScheduler, - EulerAncestralDiscreteScheduler, - StableDiffusionGLIGENPipeline, - UNet2DConditionModel, -) -from diffusers.utils.testing_utils import enable_full_determinism - -from ..pipeline_params import ( - TEXT_TO_IMAGE_BATCH_PARAMS, - TEXT_TO_IMAGE_IMAGE_PARAMS, - TEXT_TO_IMAGE_PARAMS, -) -from ..test_pipelines_common import ( - PipelineFromPipeTesterMixin, - PipelineKarrasSchedulerTesterMixin, - PipelineLatentTesterMixin, - PipelineTesterMixin, -) - - -enable_full_determinism() - - -class GligenPipelineFastTests( - PipelineLatentTesterMixin, - PipelineKarrasSchedulerTesterMixin, - PipelineTesterMixin, - PipelineFromPipeTesterMixin, - unittest.TestCase, -): - pipeline_class = StableDiffusionGLIGENPipeline - params = TEXT_TO_IMAGE_PARAMS | {"gligen_phrases", "gligen_boxes"} - batch_params = TEXT_TO_IMAGE_BATCH_PARAMS - image_params = TEXT_TO_IMAGE_IMAGE_PARAMS - image_latents_params = TEXT_TO_IMAGE_IMAGE_PARAMS - - def get_dummy_components(self): - torch.manual_seed(0) - unet = UNet2DConditionModel( - block_out_channels=(32, 64), - layers_per_block=2, - sample_size=32, - in_channels=4, - out_channels=4, - down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"), - up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"), - cross_attention_dim=32, - attention_type="gated", - ) - # unet.position_net = PositionNet(32,32) - scheduler = DDIMScheduler( - beta_start=0.00085, - beta_end=0.012, - beta_schedule="scaled_linear", - clip_sample=False, - set_alpha_to_one=False, - ) - torch.manual_seed(0) - vae = AutoencoderKL( - block_out_channels=[32, 64], - in_channels=3, - out_channels=3, - down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"], - up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"], - latent_channels=4, - sample_size=128, - ) - torch.manual_seed(0) - text_encoder_config = CLIPTextConfig( - bos_token_id=0, - eos_token_id=2, - hidden_size=32, - intermediate_size=37, - layer_norm_eps=1e-05, - num_attention_heads=4, - num_hidden_layers=5, - pad_token_id=1, - vocab_size=1000, - ) - text_encoder = CLIPTextModel(text_encoder_config) - tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip") - - components = { - "unet": unet, - "scheduler": scheduler, - "vae": vae, - "text_encoder": text_encoder, - "tokenizer": tokenizer, - "safety_checker": None, - "feature_extractor": None, - } - return components - - def get_dummy_inputs(self, device, seed=0): - if str(device).startswith("mps"): - generator = torch.manual_seed(seed) - else: - generator = torch.Generator(device=device).manual_seed(seed) - inputs = { - "prompt": "A modern livingroom", - "generator": generator, - "num_inference_steps": 2, - "guidance_scale": 6.0, - "gligen_phrases": ["a birthday cake"], - "gligen_boxes": [[0.2676, 0.6088, 0.4773, 0.7183]], - "output_type": "np", - } - return inputs - - def test_stable_diffusion_gligen_default_case(self): - device = "cpu" # ensure determinism for the device-dependent torch.Generator - components = self.get_dummy_components() - sd_pipe = StableDiffusionGLIGENPipeline(**components) - sd_pipe = sd_pipe.to(device) - sd_pipe.set_progress_bar_config(disable=None) - - inputs = self.get_dummy_inputs(device) - image = sd_pipe(**inputs).images - image_slice = image[0, -3:, -3:, -1] - - assert image.shape == (1, 64, 64, 3) - expected_slice = np.array([0.5069, 0.5561, 0.4577, 0.4792, 0.5203, 
0.4089, 0.5039, 0.4919, 0.4499]) - - assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2 - - def test_stable_diffusion_gligen_k_euler_ancestral(self): - device = "cpu" # ensure determinism for the device-dependent torch.Generator - components = self.get_dummy_components() - sd_pipe = StableDiffusionGLIGENPipeline(**components) - sd_pipe.scheduler = EulerAncestralDiscreteScheduler.from_config(sd_pipe.scheduler.config) - sd_pipe = sd_pipe.to(device) - sd_pipe.set_progress_bar_config(disable=None) - - inputs = self.get_dummy_inputs(device) - output = sd_pipe(**inputs) - image = output.images - image_slice = image[0, -3:, -3:, -1] - - assert image.shape == (1, 64, 64, 3) - expected_slice = np.array([0.425, 0.494, 0.429, 0.469, 0.525, 0.417, 0.533, 0.5, 0.47]) - - assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2 - - def test_attention_slicing_forward_pass(self): - super().test_attention_slicing_forward_pass(expected_max_diff=3e-3) - - def test_inference_batch_single_identical(self): - super().test_inference_batch_single_identical(batch_size=3, expected_max_diff=3e-3) - - @unittest.skip("Test not supported as tokenizer is used for parsing bounding boxes.") - def test_encode_prompt_works_in_isolation(self): - pass diff --git a/tests/pipelines/stable_diffusion_gligen_text_image/__init__.py b/tests/pipelines/stable_diffusion_gligen_text_image/__init__.py deleted file mode 100644 index e69de29bb2d1..000000000000 diff --git a/tests/pipelines/stable_diffusion_gligen_text_image/test_stable_diffusion_gligen_text_image.py b/tests/pipelines/stable_diffusion_gligen_text_image/test_stable_diffusion_gligen_text_image.py deleted file mode 100644 index 3f092e02dde5..000000000000 --- a/tests/pipelines/stable_diffusion_gligen_text_image/test_stable_diffusion_gligen_text_image.py +++ /dev/null @@ -1,215 +0,0 @@ -# coding=utf-8 -# Copyright 2025 HuggingFace Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
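Beyond the fast tests above, the GLIGEN pipeline's distinguishing inputs are the grounding arguments: each entry in gligen_phrases is paired with a normalized [xmin, ymin, xmax, ymax] box in gligen_boxes. A usage sketch with a full checkpoint follows; the model id comes from the diffusers GLIGEN documentation and, like the CUDA device, should be treated as an assumption here, since the deleted tests build tiny dummy components instead.

import torch
from diffusers import StableDiffusionGLIGENPipeline

# Assumed checkpoint and device; not part of the deleted tests.
pipe = StableDiffusionGLIGENPipeline.from_pretrained(
    "masterful/gligen-1-4-generation-text-box", torch_dtype=torch.float16
).to("cuda")

image = pipe(
    prompt="A modern livingroom",
    gligen_phrases=["a birthday cake"],  # one phrase per grounded object
    gligen_boxes=[[0.2676, 0.6088, 0.4773, 0.7183]],  # matching normalized box
    num_inference_steps=50,
    guidance_scale=6.0,
    generator=torch.Generator("cuda").manual_seed(0),
).images[0]
image.save("grounded_livingroom.png")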
- -import unittest - -import numpy as np -import torch -from transformers import ( - CLIPProcessor, - CLIPTextConfig, - CLIPTextModel, - CLIPTokenizer, - CLIPVisionConfig, - CLIPVisionModelWithProjection, -) - -from diffusers import ( - AutoencoderKL, - DDIMScheduler, - EulerAncestralDiscreteScheduler, - StableDiffusionGLIGENTextImagePipeline, - UNet2DConditionModel, -) -from diffusers.pipelines.stable_diffusion import CLIPImageProjection -from diffusers.utils import load_image -from diffusers.utils.testing_utils import enable_full_determinism, torch_device - -from ..pipeline_params import ( - TEXT_TO_IMAGE_BATCH_PARAMS, - TEXT_TO_IMAGE_IMAGE_PARAMS, - TEXT_TO_IMAGE_PARAMS, -) -from ..test_pipelines_common import ( - PipelineFromPipeTesterMixin, - PipelineKarrasSchedulerTesterMixin, - PipelineLatentTesterMixin, - PipelineTesterMixin, -) - - -enable_full_determinism() - - -class GligenTextImagePipelineFastTests( - PipelineLatentTesterMixin, - PipelineKarrasSchedulerTesterMixin, - PipelineTesterMixin, - PipelineFromPipeTesterMixin, - unittest.TestCase, -): - pipeline_class = StableDiffusionGLIGENTextImagePipeline - params = TEXT_TO_IMAGE_PARAMS | {"gligen_phrases", "gligen_images", "gligen_boxes"} - batch_params = TEXT_TO_IMAGE_BATCH_PARAMS - image_params = TEXT_TO_IMAGE_IMAGE_PARAMS - image_latents_params = TEXT_TO_IMAGE_IMAGE_PARAMS - - supports_dduf = False - - def get_dummy_components(self): - torch.manual_seed(0) - unet = UNet2DConditionModel( - block_out_channels=(32, 64), - layers_per_block=2, - sample_size=32, - in_channels=4, - out_channels=4, - down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"), - up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"), - cross_attention_dim=32, - attention_type="gated-text-image", - ) - # unet.position_net = PositionNet(32,32) - scheduler = DDIMScheduler( - beta_start=0.00085, - beta_end=0.012, - beta_schedule="scaled_linear", - clip_sample=False, - set_alpha_to_one=False, - ) - torch.manual_seed(0) - vae = AutoencoderKL( - block_out_channels=[32, 64], - in_channels=3, - out_channels=3, - down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"], - up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"], - latent_channels=4, - sample_size=128, - ) - torch.manual_seed(0) - text_encoder_config = CLIPTextConfig( - bos_token_id=0, - eos_token_id=2, - hidden_size=32, - intermediate_size=37, - layer_norm_eps=1e-05, - num_attention_heads=4, - num_hidden_layers=5, - pad_token_id=1, - vocab_size=1000, - ) - text_encoder = CLIPTextModel(text_encoder_config) - tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip") - - image_encoder_config = CLIPVisionConfig( - hidden_size=32, - projection_dim=32, - intermediate_size=37, - layer_norm_eps=1e-05, - num_attention_heads=4, - num_hidden_layers=5, - ) - image_encoder = CLIPVisionModelWithProjection(image_encoder_config) - processor = CLIPProcessor.from_pretrained("openai/clip-vit-large-patch14") - - image_project = CLIPImageProjection(hidden_size=32) - - components = { - "unet": unet, - "scheduler": scheduler, - "vae": vae, - "text_encoder": text_encoder, - "tokenizer": tokenizer, - "safety_checker": None, - "feature_extractor": None, - "image_encoder": image_encoder, - "image_project": image_project, - "processor": processor, - } - return components - - def get_dummy_inputs(self, device, seed=0): - if str(device).startswith("mps"): - generator = torch.manual_seed(seed) - else: - generator = torch.Generator(device=device).manual_seed(seed) - - gligen_images = load_image( - 
"https://hf.co/datasets/huggingface/documentation-images/resolve/main/diffusers/gligen/livingroom_modern.png" - ) - inputs = { - "prompt": "A modern livingroom", - "generator": generator, - "num_inference_steps": 2, - "guidance_scale": 6.0, - "gligen_phrases": ["a birthday cake"], - "gligen_images": [gligen_images], - "gligen_boxes": [[0.2676, 0.6088, 0.4773, 0.7183]], - "output_type": "np", - } - return inputs - - def test_dict_tuple_outputs_equivalent(self): - expected_slice = None - if torch_device == "cpu": - expected_slice = np.array([0.5052, 0.5546, 0.4567, 0.4770, 0.5195, 0.4085, 0.5026, 0.4909, 0.4495]) - super().test_dict_tuple_outputs_equivalent(expected_slice=expected_slice) - - def test_stable_diffusion_gligen_text_image_default_case(self): - device = "cpu" # ensure determinism for the device-dependent torch.Generator - components = self.get_dummy_components() - sd_pipe = StableDiffusionGLIGENTextImagePipeline(**components) - sd_pipe = sd_pipe.to(device) - sd_pipe.set_progress_bar_config(disable=None) - - inputs = self.get_dummy_inputs(device) - image = sd_pipe(**inputs).images - image_slice = image[0, -3:, -3:, -1] - - assert image.shape == (1, 64, 64, 3) - expected_slice = np.array([0.5069, 0.5561, 0.4577, 0.4792, 0.5203, 0.4089, 0.5039, 0.4919, 0.4499]) - - assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2 - - def test_stable_diffusion_gligen_k_euler_ancestral(self): - device = "cpu" # ensure determinism for the device-dependent torch.Generator - components = self.get_dummy_components() - sd_pipe = StableDiffusionGLIGENTextImagePipeline(**components) - sd_pipe.scheduler = EulerAncestralDiscreteScheduler.from_config(sd_pipe.scheduler.config) - sd_pipe = sd_pipe.to(device) - sd_pipe.set_progress_bar_config(disable=None) - - inputs = self.get_dummy_inputs(device) - image = sd_pipe(**inputs).images - image_slice = image[0, -3:, -3:, -1] - - assert image.shape == (1, 64, 64, 3) - - expected_slice = np.array([0.425, 0.494, 0.429, 0.469, 0.525, 0.417, 0.533, 0.5, 0.47]) - - assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2 - - def test_attention_slicing_forward_pass(self): - super().test_attention_slicing_forward_pass(expected_max_diff=3e-3) - - def test_inference_batch_single_identical(self): - super().test_inference_batch_single_identical(batch_size=3, expected_max_diff=3e-3) - - @unittest.skip( - "Test not supported because of the use of `text_encoder` in `get_cross_attention_kwargs_with_grounded()`." - ) - def test_encode_prompt_works_in_isolation(self): - pass diff --git a/tests/pipelines/stable_diffusion_ldm3d/__init__.py b/tests/pipelines/stable_diffusion_ldm3d/__init__.py deleted file mode 100644 index e69de29bb2d1..000000000000 diff --git a/tests/pipelines/stable_diffusion_ldm3d/test_stable_diffusion_ldm3d.py b/tests/pipelines/stable_diffusion_ldm3d/test_stable_diffusion_ldm3d.py deleted file mode 100644 index 936e22b4705e..000000000000 --- a/tests/pipelines/stable_diffusion_ldm3d/test_stable_diffusion_ldm3d.py +++ /dev/null @@ -1,326 +0,0 @@ -# coding=utf-8 -# Copyright 2025 HuggingFace Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-# See the License for the specific language governing permissions and -# limitations under the License. - - -import gc -import unittest - -import numpy as np -import torch -from transformers import CLIPTextConfig, CLIPTextModel, CLIPTokenizer - -from diffusers import ( - AutoencoderKL, - DDIMScheduler, - PNDMScheduler, - StableDiffusionLDM3DPipeline, - UNet2DConditionModel, -) -from diffusers.utils.testing_utils import ( - backend_empty_cache, - enable_full_determinism, - nightly, - require_torch_accelerator, - torch_device, -) - -from ..pipeline_params import TEXT_TO_IMAGE_BATCH_PARAMS, TEXT_TO_IMAGE_IMAGE_PARAMS, TEXT_TO_IMAGE_PARAMS - - -enable_full_determinism() - - -class StableDiffusionLDM3DPipelineFastTests(unittest.TestCase): - pipeline_class = StableDiffusionLDM3DPipeline - params = TEXT_TO_IMAGE_PARAMS - batch_params = TEXT_TO_IMAGE_BATCH_PARAMS - image_params = TEXT_TO_IMAGE_IMAGE_PARAMS - - def get_dummy_components(self): - torch.manual_seed(0) - unet = UNet2DConditionModel( - block_out_channels=(32, 64), - layers_per_block=2, - sample_size=32, - in_channels=4, - out_channels=4, - down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"), - up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"), - cross_attention_dim=32, - ) - scheduler = DDIMScheduler( - beta_start=0.00085, - beta_end=0.012, - beta_schedule="scaled_linear", - clip_sample=False, - set_alpha_to_one=False, - ) - torch.manual_seed(0) - vae = AutoencoderKL( - block_out_channels=[32, 64], - in_channels=6, - out_channels=6, - down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"], - up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"], - latent_channels=4, - ) - torch.manual_seed(0) - text_encoder_config = CLIPTextConfig( - bos_token_id=0, - eos_token_id=2, - hidden_size=32, - intermediate_size=37, - layer_norm_eps=1e-05, - num_attention_heads=4, - num_hidden_layers=5, - pad_token_id=1, - vocab_size=1000, - ) - text_encoder = CLIPTextModel(text_encoder_config) - tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip") - - components = { - "unet": unet, - "scheduler": scheduler, - "vae": vae, - "text_encoder": text_encoder, - "tokenizer": tokenizer, - "safety_checker": None, - "feature_extractor": None, - "image_encoder": None, - } - return components - - def get_dummy_inputs(self, device, seed=0): - if str(device).startswith("mps"): - generator = torch.manual_seed(seed) - else: - generator = torch.Generator(device=device).manual_seed(seed) - inputs = { - "prompt": "A painting of a squirrel eating a burger", - "generator": generator, - "num_inference_steps": 2, - "guidance_scale": 6.0, - "output_type": "np", - } - return inputs - - def test_stable_diffusion_ddim(self): - device = "cpu" # ensure determinism for the device-dependent torch.Generator - - components = self.get_dummy_components() - ldm3d_pipe = StableDiffusionLDM3DPipeline(**components) - ldm3d_pipe = ldm3d_pipe.to(torch_device) - ldm3d_pipe.set_progress_bar_config(disable=None) - - inputs = self.get_dummy_inputs(device) - output = ldm3d_pipe(**inputs) - rgb, depth = output.rgb, output.depth - - image_slice_rgb = rgb[0, -3:, -3:, -1] - image_slice_depth = depth[0, -3:, -1] - - assert rgb.shape == (1, 64, 64, 3) - assert depth.shape == (1, 64, 64) - - expected_slice_rgb = np.array( - [0.37338176, 0.70247, 0.74203193, 0.51643604, 0.58256793, 0.60932136, 0.4181095, 0.48355877, 0.46535262] - ) - expected_slice_depth = np.array([103.46727, 85.812004, 87.849236]) - - assert np.abs(image_slice_rgb.flatten() - 
expected_slice_rgb).max() < 1e-2 - assert np.abs(image_slice_depth.flatten() - expected_slice_depth).max() < 1e-2 - - def test_stable_diffusion_prompt_embeds(self): - components = self.get_dummy_components() - ldm3d_pipe = StableDiffusionLDM3DPipeline(**components) - ldm3d_pipe = ldm3d_pipe.to(torch_device) - ldm3d_pipe.set_progress_bar_config(disable=None) - - inputs = self.get_dummy_inputs(torch_device) - inputs["prompt"] = 3 * [inputs["prompt"]] - - # forward - output = ldm3d_pipe(**inputs) - rgb_slice_1, depth_slice_1 = output.rgb, output.depth - rgb_slice_1 = rgb_slice_1[0, -3:, -3:, -1] - depth_slice_1 = depth_slice_1[0, -3:, -1] - - inputs = self.get_dummy_inputs(torch_device) - prompt = 3 * [inputs.pop("prompt")] - - text_inputs = ldm3d_pipe.tokenizer( - prompt, - padding="max_length", - max_length=ldm3d_pipe.tokenizer.model_max_length, - truncation=True, - return_tensors="pt", - ) - text_inputs = text_inputs["input_ids"].to(torch_device) - - prompt_embeds = ldm3d_pipe.text_encoder(text_inputs)[0] - - inputs["prompt_embeds"] = prompt_embeds - - # forward - output = ldm3d_pipe(**inputs) - rgb_slice_2, depth_slice_2 = output.rgb, output.depth - rgb_slice_2 = rgb_slice_2[0, -3:, -3:, -1] - depth_slice_2 = depth_slice_2[0, -3:, -1] - - assert np.abs(rgb_slice_1.flatten() - rgb_slice_2.flatten()).max() < 1e-4 - assert np.abs(depth_slice_1.flatten() - depth_slice_2.flatten()).max() < 1e-4 - - def test_stable_diffusion_negative_prompt(self): - device = "cpu" # ensure determinism for the device-dependent torch.Generator - components = self.get_dummy_components() - components["scheduler"] = PNDMScheduler(skip_prk_steps=True) - ldm3d_pipe = StableDiffusionLDM3DPipeline(**components) - ldm3d_pipe = ldm3d_pipe.to(device) - ldm3d_pipe.set_progress_bar_config(disable=None) - - inputs = self.get_dummy_inputs(device) - negative_prompt = "french fries" - output = ldm3d_pipe(**inputs, negative_prompt=negative_prompt) - - rgb, depth = output.rgb, output.depth - rgb_slice = rgb[0, -3:, -3:, -1] - depth_slice = depth[0, -3:, -1] - - assert rgb.shape == (1, 64, 64, 3) - assert depth.shape == (1, 64, 64) - - expected_slice_rgb = np.array( - [0.37044, 0.71811503, 0.7223251, 0.48603675, 0.5638391, 0.6364948, 0.42833704, 0.4901315, 0.47926217] - ) - expected_slice_depth = np.array([107.84738, 84.62802, 89.962135]) - assert np.abs(rgb_slice.flatten() - expected_slice_rgb).max() < 1e-2 - assert np.abs(depth_slice.flatten() - expected_slice_depth).max() < 1e-2 - - -@nightly -@require_torch_accelerator -class StableDiffusionLDM3DPipelineSlowTests(unittest.TestCase): - def setUp(self): - super().setUp() - gc.collect() - backend_empty_cache(torch_device) - - def tearDown(self): - super().tearDown() - gc.collect() - backend_empty_cache(torch_device) - - def get_inputs(self, device, generator_device="cpu", dtype=torch.float32, seed=0): - generator = torch.Generator(device=generator_device).manual_seed(seed) - latents = np.random.RandomState(seed).standard_normal((1, 4, 64, 64)) - latents = torch.from_numpy(latents).to(device=device, dtype=dtype) - inputs = { - "prompt": "a photograph of an astronaut riding a horse", - "latents": latents, - "generator": generator, - "num_inference_steps": 3, - "guidance_scale": 7.5, - "output_type": "np", - } - return inputs - - def test_ldm3d_stable_diffusion(self): - ldm3d_pipe = StableDiffusionLDM3DPipeline.from_pretrained("Intel/ldm3d") - ldm3d_pipe = ldm3d_pipe.to(torch_device) - ldm3d_pipe.set_progress_bar_config(disable=None) - - inputs = self.get_inputs(torch_device) - 
output = ldm3d_pipe(**inputs)
-        rgb, depth = output.rgb, output.depth
-        rgb_slice = rgb[0, -3:, -3:, -1].flatten()
-        depth_slice = depth[0, -3:, -1].flatten()
-
-        assert rgb.shape == (1, 512, 512, 3)
-        assert depth.shape == (1, 512, 512)
-
-        expected_slice_rgb = np.array(
-            [0.53805465, 0.56707305, 0.5486515, 0.57012236, 0.5814511, 0.56253487, 0.54843014, 0.55092263, 0.6459706]
-        )
-        expected_slice_depth = np.array(
-            [0.9263781, 0.6678672, 0.5486515, 0.92202145, 0.67831135, 0.56253487, 0.9241694, 0.7551478, 0.6459706]
-        )
-        assert np.abs(rgb_slice - expected_slice_rgb).max() < 3e-3
-        assert np.abs(depth_slice - expected_slice_depth).max() < 3e-3
-
-
-@nightly
-@require_torch_accelerator
-class StableDiffusionPipelineNightlyTests(unittest.TestCase):
-    def setUp(self):
-        super().setUp()
-        gc.collect()
-        backend_empty_cache(torch_device)
-
-    def tearDown(self):
-        super().tearDown()
-        gc.collect()
-        backend_empty_cache(torch_device)
-
-    def get_inputs(self, device, generator_device="cpu", dtype=torch.float32, seed=0):
-        generator = torch.Generator(device=generator_device).manual_seed(seed)
-        latents = np.random.RandomState(seed).standard_normal((1, 4, 64, 64))
-        latents = torch.from_numpy(latents).to(device=device, dtype=dtype)
-        inputs = {
-            "prompt": "a photograph of an astronaut riding a horse",
-            "latents": latents,
-            "generator": generator,
-            "num_inference_steps": 50,
-            "guidance_scale": 7.5,
-            "output_type": "np",
-        }
-        return inputs
-
-    def test_ldm3d(self):
-        ldm3d_pipe = StableDiffusionLDM3DPipeline.from_pretrained("Intel/ldm3d").to(torch_device)
-        ldm3d_pipe.set_progress_bar_config(disable=None)
-
-        inputs = self.get_inputs(torch_device)
-        output = ldm3d_pipe(**inputs)
-        rgb, depth = output.rgb, output.depth
-
-        expected_rgb_mean = 0.495586
-        expected_rgb_std = 0.33795515
-        expected_depth_mean = 112.48518
-        expected_depth_std = 98.489746
-        assert np.abs(expected_rgb_mean - rgb.mean()) < 1e-3
-        assert np.abs(expected_rgb_std - rgb.std()) < 1e-3
-        assert np.abs(expected_depth_mean - depth.mean()) < 1e-3
-        assert np.abs(expected_depth_std - depth.std()) < 1e-3
-
-    def test_ldm3d_v2(self):
-        ldm3d_pipe = StableDiffusionLDM3DPipeline.from_pretrained("Intel/ldm3d-4c").to(torch_device)
-        ldm3d_pipe.set_progress_bar_config(disable=None)
-
-        inputs = self.get_inputs(torch_device)
-        output = ldm3d_pipe(**inputs)
-        rgb, depth = output.rgb, output.depth
-
-        expected_rgb_mean = 0.4194127
-        expected_rgb_std = 0.35375586
-        expected_depth_mean = 0.5638502
-        expected_depth_std = 0.34686103
-
-        assert rgb.shape == (1, 512, 512, 3)
-        assert depth.shape == (1, 512, 512, 1)
-        assert np.abs(expected_rgb_mean - rgb.mean()) < 1e-3
-        assert np.abs(expected_rgb_std - rgb.std()) < 1e-3
-        assert np.abs(expected_depth_mean - depth.mean()) < 1e-3
-        assert np.abs(expected_depth_std - depth.std()) < 1e-3
diff --git a/tests/pipelines/stable_diffusion_panorama/__init__.py b/tests/pipelines/stable_diffusion_panorama/__init__.py
deleted file mode 100644
index e69de29bb2d1..000000000000
diff --git a/tests/pipelines/stable_diffusion_panorama/test_stable_diffusion_panorama.py b/tests/pipelines/stable_diffusion_panorama/test_stable_diffusion_panorama.py
deleted file mode 100644
index 61f91cae2b0d..000000000000
--- a/tests/pipelines/stable_diffusion_panorama/test_stable_diffusion_panorama.py
+++ /dev/null
@@ -1,444 +0,0 @@
-# coding=utf-8
-# Copyright 2025 HuggingFace Inc.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import gc -import unittest - -import numpy as np -import torch -from transformers import CLIPTextConfig, CLIPTextModel, CLIPTokenizer - -from diffusers import ( - AutoencoderKL, - DDIMScheduler, - EulerAncestralDiscreteScheduler, - LMSDiscreteScheduler, - PNDMScheduler, - StableDiffusionPanoramaPipeline, - UNet2DConditionModel, -) -from diffusers.utils.testing_utils import ( - backend_empty_cache, - backend_max_memory_allocated, - backend_reset_max_memory_allocated, - backend_reset_peak_memory_stats, - enable_full_determinism, - nightly, - require_torch_accelerator, - skip_mps, - torch_device, -) - -from ..pipeline_params import TEXT_TO_IMAGE_BATCH_PARAMS, TEXT_TO_IMAGE_IMAGE_PARAMS, TEXT_TO_IMAGE_PARAMS -from ..test_pipelines_common import ( - IPAdapterTesterMixin, - PipelineFromPipeTesterMixin, - PipelineLatentTesterMixin, - PipelineTesterMixin, -) - - -enable_full_determinism() - - -@skip_mps -class StableDiffusionPanoramaPipelineFastTests( - IPAdapterTesterMixin, - PipelineLatentTesterMixin, - PipelineTesterMixin, - PipelineFromPipeTesterMixin, - unittest.TestCase, -): - pipeline_class = StableDiffusionPanoramaPipeline - params = TEXT_TO_IMAGE_PARAMS - batch_params = TEXT_TO_IMAGE_BATCH_PARAMS - image_params = TEXT_TO_IMAGE_IMAGE_PARAMS - image_latents_params = TEXT_TO_IMAGE_IMAGE_PARAMS - - def get_dummy_components(self): - torch.manual_seed(0) - unet = UNet2DConditionModel( - block_out_channels=(32, 64), - layers_per_block=1, - sample_size=32, - in_channels=4, - out_channels=4, - down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"), - up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"), - cross_attention_dim=32, - ) - scheduler = DDIMScheduler() - torch.manual_seed(0) - vae = AutoencoderKL( - block_out_channels=[32, 64], - in_channels=3, - out_channels=3, - down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"], - up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"], - latent_channels=4, - ) - torch.manual_seed(0) - text_encoder_config = CLIPTextConfig( - bos_token_id=0, - eos_token_id=2, - hidden_size=32, - intermediate_size=37, - layer_norm_eps=1e-05, - num_attention_heads=4, - num_hidden_layers=5, - pad_token_id=1, - vocab_size=1000, - ) - text_encoder = CLIPTextModel(text_encoder_config) - tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip") - - components = { - "unet": unet, - "scheduler": scheduler, - "vae": vae, - "text_encoder": text_encoder, - "tokenizer": tokenizer, - "safety_checker": None, - "feature_extractor": None, - "image_encoder": None, - } - return components - - def get_dummy_inputs(self, device, seed=0): - generator = torch.Generator(device=device).manual_seed(seed) - inputs = { - "prompt": "a photo of the dolomites", - "generator": generator, - # Setting height and width to None to prevent OOMs on CPU. 
- "height": None, - "width": None, - "num_inference_steps": 1, - "guidance_scale": 6.0, - "output_type": "np", - } - return inputs - - def test_stable_diffusion_panorama_default_case(self): - device = "cpu" # ensure determinism for the device-dependent torch.Generator - components = self.get_dummy_components() - sd_pipe = StableDiffusionPanoramaPipeline(**components) - sd_pipe = sd_pipe.to(device) - sd_pipe.set_progress_bar_config(disable=None) - - inputs = self.get_dummy_inputs(device) - image = sd_pipe(**inputs).images - image_slice = image[0, -3:, -3:, -1] - assert image.shape == (1, 64, 64, 3) - - expected_slice = np.array([0.6186, 0.5374, 0.4915, 0.4135, 0.4114, 0.4563, 0.5128, 0.4977, 0.4757]) - - assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2 - - def test_stable_diffusion_panorama_circular_padding_case(self): - device = "cpu" # ensure determinism for the device-dependent torch.Generator - components = self.get_dummy_components() - sd_pipe = StableDiffusionPanoramaPipeline(**components) - sd_pipe = sd_pipe.to(device) - sd_pipe.set_progress_bar_config(disable=None) - - inputs = self.get_dummy_inputs(device) - image = sd_pipe(**inputs, circular_padding=True).images - image_slice = image[0, -3:, -3:, -1] - assert image.shape == (1, 64, 64, 3) - - expected_slice = np.array([0.6127, 0.6299, 0.4595, 0.4051, 0.4543, 0.3925, 0.5510, 0.5693, 0.5031]) - - assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2 - - # override to speed the overall test timing up. - def test_inference_batch_consistent(self): - super().test_inference_batch_consistent(batch_sizes=[1, 2]) - - # override to speed the overall test timing up. - def test_inference_batch_single_identical(self): - super().test_inference_batch_single_identical(batch_size=2, expected_max_diff=5.0e-3) - - def test_float16_inference(self): - super().test_float16_inference(expected_max_diff=1e-1) - - def test_stable_diffusion_panorama_negative_prompt(self): - device = "cpu" # ensure determinism for the device-dependent torch.Generator - components = self.get_dummy_components() - sd_pipe = StableDiffusionPanoramaPipeline(**components) - sd_pipe = sd_pipe.to(device) - sd_pipe.set_progress_bar_config(disable=None) - - inputs = self.get_dummy_inputs(device) - negative_prompt = "french fries" - output = sd_pipe(**inputs, negative_prompt=negative_prompt) - image = output.images - image_slice = image[0, -3:, -3:, -1] - - assert image.shape == (1, 64, 64, 3) - - expected_slice = np.array([0.6187, 0.5375, 0.4915, 0.4136, 0.4114, 0.4563, 0.5128, 0.4976, 0.4757]) - - assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2 - - def test_stable_diffusion_panorama_views_batch(self): - device = "cpu" # ensure determinism for the device-dependent torch.Generator - components = self.get_dummy_components() - sd_pipe = StableDiffusionPanoramaPipeline(**components) - sd_pipe = sd_pipe.to(device) - sd_pipe.set_progress_bar_config(disable=None) - - inputs = self.get_dummy_inputs(device) - output = sd_pipe(**inputs, view_batch_size=2) - image = output.images - image_slice = image[0, -3:, -3:, -1] - - assert image.shape == (1, 64, 64, 3) - - expected_slice = np.array([0.6187, 0.5375, 0.4915, 0.4136, 0.4114, 0.4563, 0.5128, 0.4976, 0.4757]) - - assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2 - - def test_stable_diffusion_panorama_views_batch_circular_padding(self): - device = "cpu" # ensure determinism for the device-dependent torch.Generator - components = self.get_dummy_components() - sd_pipe = 
StableDiffusionPanoramaPipeline(**components) - sd_pipe = sd_pipe.to(device) - sd_pipe.set_progress_bar_config(disable=None) - - inputs = self.get_dummy_inputs(device) - output = sd_pipe(**inputs, circular_padding=True, view_batch_size=2) - image = output.images - image_slice = image[0, -3:, -3:, -1] - - assert image.shape == (1, 64, 64, 3) - - expected_slice = np.array([0.6127, 0.6299, 0.4595, 0.4051, 0.4543, 0.3925, 0.5510, 0.5693, 0.5031]) - - assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2 - - def test_stable_diffusion_panorama_euler(self): - device = "cpu" # ensure determinism for the device-dependent torch.Generator - components = self.get_dummy_components() - components["scheduler"] = EulerAncestralDiscreteScheduler( - beta_start=0.00085, beta_end=0.012, beta_schedule="scaled_linear" - ) - sd_pipe = StableDiffusionPanoramaPipeline(**components) - sd_pipe = sd_pipe.to(device) - sd_pipe.set_progress_bar_config(disable=None) - - inputs = self.get_dummy_inputs(device) - image = sd_pipe(**inputs).images - image_slice = image[0, -3:, -3:, -1] - - assert image.shape == (1, 64, 64, 3) - - expected_slice = np.array([0.4024, 0.6510, 0.4901, 0.5378, 0.5813, 0.5622, 0.4795, 0.4467, 0.4952]) - - assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2 - - def test_stable_diffusion_panorama_pndm(self): - device = "cpu" # ensure determinism for the device-dependent torch.Generator - components = self.get_dummy_components() - components["scheduler"] = PNDMScheduler( - beta_start=0.00085, beta_end=0.012, beta_schedule="scaled_linear", skip_prk_steps=True - ) - sd_pipe = StableDiffusionPanoramaPipeline(**components) - sd_pipe = sd_pipe.to(device) - sd_pipe.set_progress_bar_config(disable=None) - - inputs = self.get_dummy_inputs(device) - image = sd_pipe(**inputs).images - image_slice = image[0, -3:, -3:, -1] - - assert image.shape == (1, 64, 64, 3) - - expected_slice = np.array([0.6391, 0.6291, 0.4861, 0.5134, 0.5552, 0.4578, 0.5032, 0.5023, 0.4539]) - - assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2 - - def test_encode_prompt_works_in_isolation(self): - extra_required_param_value_dict = { - "device": torch.device(torch_device).type, - "do_classifier_free_guidance": self.get_dummy_inputs(device=torch_device).get("guidance_scale", 1.0) > 1.0, - } - return super().test_encode_prompt_works_in_isolation(extra_required_param_value_dict) - - -@nightly -@require_torch_accelerator -class StableDiffusionPanoramaNightlyTests(unittest.TestCase): - def setUp(self): - super().setUp() - gc.collect() - backend_empty_cache(torch_device) - - def tearDown(self): - super().tearDown() - gc.collect() - backend_empty_cache(torch_device) - - def get_inputs(self, seed=0): - generator = torch.manual_seed(seed) - inputs = { - "prompt": "a photo of the dolomites", - "generator": generator, - "num_inference_steps": 3, - "guidance_scale": 7.5, - "output_type": "np", - } - return inputs - - def test_stable_diffusion_panorama_default(self): - model_ckpt = "stabilityai/stable-diffusion-2-base" - scheduler = DDIMScheduler.from_pretrained(model_ckpt, subfolder="scheduler") - pipe = StableDiffusionPanoramaPipeline.from_pretrained(model_ckpt, scheduler=scheduler, safety_checker=None) - pipe.to(torch_device) - pipe.set_progress_bar_config(disable=None) - pipe.enable_attention_slicing() - - inputs = self.get_inputs() - image = pipe(**inputs).images - image_slice = image[0, -3:, -3:, -1].flatten() - - assert image.shape == (1, 512, 2048, 3) - - expected_slice = np.array( - [ - 0.36968392, - 
0.27025372,
-                0.32446766,
-                0.28379387,
-                0.36363274,
-                0.30733347,
-                0.27100027,
-                0.27054125,
-                0.25536096,
-            ]
-        )
-
-        assert np.abs(expected_slice - image_slice).max() < 1e-2
-
-    def test_stable_diffusion_panorama_k_lms(self):
-        pipe = StableDiffusionPanoramaPipeline.from_pretrained(
-            "stabilityai/stable-diffusion-2-base", safety_checker=None
-        )
-        pipe.scheduler = LMSDiscreteScheduler.from_config(pipe.scheduler.config)
-        pipe.unet.set_default_attn_processor()
-        pipe.to(torch_device)
-        pipe.set_progress_bar_config(disable=None)
-        pipe.enable_attention_slicing()
-
-        inputs = self.get_inputs()
-        image = pipe(**inputs).images
-        image_slice = image[0, -3:, -3:, -1].flatten()
-        assert image.shape == (1, 512, 2048, 3)
-
-        expected_slice = np.array([0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0])
-
-        assert np.abs(expected_slice - image_slice).max() < 1e-2
-
-    def test_stable_diffusion_panorama_intermediate_state(self):
-        number_of_steps = 0
-
-        def callback_fn(step: int, timestep: int, latents: torch.Tensor) -> None:
-            callback_fn.has_been_called = True
-            nonlocal number_of_steps
-            number_of_steps += 1
-            if step == 1:
-                latents = latents.detach().cpu().numpy()
-                assert latents.shape == (1, 4, 64, 256)
-                latents_slice = latents[0, -3:, -3:, -1]
-
-                expected_slice = np.array(
-                    [
-                        0.18681869,
-                        0.33907816,
-                        0.5361276,
-                        0.14432865,
-                        -0.02856611,
-                        -0.73941123,
-                        0.23397987,
-                        0.47322682,
-                        -0.37823164,
-                    ]
-                )
-                assert np.abs(latents_slice.flatten() - expected_slice).max() < 5e-2
-            elif step == 2:
-                latents = latents.detach().cpu().numpy()
-                assert latents.shape == (1, 4, 64, 256)
-                latents_slice = latents[0, -3:, -3:, -1]
-
-                expected_slice = np.array(
-                    [
-                        0.18539645,
-                        0.33987248,
-                        0.5378559,
-                        0.14437142,
-                        -0.02455261,
-                        -0.7338317,
-                        0.23990755,
-                        0.47356272,
-                        -0.3786505,
-                    ]
-                )
-
-                assert np.abs(latents_slice.flatten() - expected_slice).max() < 5e-2
-
-        callback_fn.has_been_called = False
-
-        model_ckpt = "stabilityai/stable-diffusion-2-base"
-        scheduler = DDIMScheduler.from_pretrained(model_ckpt, subfolder="scheduler")
-        pipe = StableDiffusionPanoramaPipeline.from_pretrained(model_ckpt, scheduler=scheduler, safety_checker=None)
-        pipe = pipe.to(torch_device)
-        pipe.set_progress_bar_config(disable=None)
-        pipe.enable_attention_slicing()
-
-        inputs = self.get_inputs()
-        pipe(**inputs, callback=callback_fn, callback_steps=1)
-        assert callback_fn.has_been_called
-        assert number_of_steps == 3
-
-    def test_stable_diffusion_panorama_pipeline_with_sequential_cpu_offloading(self):
-        backend_empty_cache(torch_device)
-        backend_reset_max_memory_allocated(torch_device)
-        backend_reset_peak_memory_stats(torch_device)
-
-        model_ckpt = "stabilityai/stable-diffusion-2-base"
-        scheduler = DDIMScheduler.from_pretrained(model_ckpt, subfolder="scheduler")
-        pipe = StableDiffusionPanoramaPipeline.from_pretrained(model_ckpt, scheduler=scheduler, safety_checker=None)
-        pipe.set_progress_bar_config(disable=None)
-        pipe.enable_attention_slicing(1)
-        pipe.enable_sequential_cpu_offload()
-
-        inputs = self.get_inputs()
-        _ = pipe(**inputs)
-
-        mem_bytes = backend_max_memory_allocated(torch_device)
-        # make sure that less than 5.5 GB is allocated
-        assert mem_bytes < 5.5 * 10**9
diff --git a/tests/pipelines/stable_diffusion_safe/__init__.py b/tests/pipelines/stable_diffusion_safe/__init__.py
deleted file mode 100644
index e69de29bb2d1..000000000000
diff --git a/tests/pipelines/stable_diffusion_safe/test_safe_diffusion.py b/tests/pipelines/stable_diffusion_safe/test_safe_diffusion.py
deleted file mode 100644
index 5d81cff3e0d3..000000000000
--- a/tests/pipelines/stable_diffusion_safe/test_safe_diffusion.py
+++ /dev/null
@@ -1,497 +0,0 @@
-# coding=utf-8
-# Copyright 2025 HuggingFace Inc.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import gc
-import random
-import tempfile
-import unittest
-
-import numpy as np
-import torch
-from transformers import CLIPTextConfig, CLIPTextModel, CLIPTokenizer
-
-from diffusers import AutoencoderKL, DDIMScheduler, LMSDiscreteScheduler, PNDMScheduler, UNet2DConditionModel
-from diffusers.pipelines.stable_diffusion_safe import StableDiffusionPipelineSafe as StableDiffusionPipeline
-from diffusers.utils.testing_utils import (
-    Expectations,
-    backend_empty_cache,
-    floats_tensor,
-    nightly,
-    require_accelerator,
-    require_torch_accelerator,
-    torch_device,
-)
-
-
-class SafeDiffusionPipelineFastTests(unittest.TestCase):
-    def setUp(self):
-        # clean up the VRAM before each test
-        super().setUp()
-        gc.collect()
-        backend_empty_cache(torch_device)
-
-    def tearDown(self):
-        # clean up the VRAM after each test
-        super().tearDown()
-        gc.collect()
-        backend_empty_cache(torch_device)
-
-    @property
-    def dummy_image(self):
-        batch_size = 1
-        num_channels = 3
-        sizes = (32, 32)
-
-        image = floats_tensor((batch_size, num_channels) + sizes, rng=random.Random(0)).to(torch_device)
-        return image
-
-    @property
-    def dummy_cond_unet(self):
-        torch.manual_seed(0)
-        model = UNet2DConditionModel(
-            block_out_channels=(32, 64),
-            layers_per_block=2,
-            sample_size=32,
-            in_channels=4,
-            out_channels=4,
-            down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"),
-            up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"),
-            cross_attention_dim=32,
-        )
-        return model
-
-    @property
-    def dummy_vae(self):
-        torch.manual_seed(0)
-        model = AutoencoderKL(
-            block_out_channels=[32, 64],
-            in_channels=3,
-            out_channels=3,
-            down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"],
-            up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"],
-            latent_channels=4,
-        )
-        return model
-
-    @property
-    def dummy_text_encoder(self):
-        torch.manual_seed(0)
-        config = CLIPTextConfig(
-            bos_token_id=0,
-            eos_token_id=2,
-            hidden_size=32,
-            intermediate_size=37,
-            layer_norm_eps=1e-05,
-            num_attention_heads=4,
-            num_hidden_layers=5,
-            pad_token_id=1,
-            vocab_size=1000,
-        )
-        return CLIPTextModel(config)
-
-    @property
-    def dummy_extractor(self):
-        def extract(*args, **kwargs):
-            class Out:
-                def __init__(self):
-                    self.pixel_values = torch.ones([0])
-
-                def to(self, device):
-                    self.pixel_values = self.pixel_values.to(device)
-                    return self
-
-            return Out()
-
-        return extract
-
-    def test_safe_diffusion_ddim(self):
-        device = "cpu"  # ensure determinism for the device-dependent torch.Generator
-        unet = self.dummy_cond_unet
-        scheduler = DDIMScheduler(
-            beta_start=0.00085,
-            beta_end=0.012,
-            beta_schedule="scaled_linear",
-            clip_sample=False,
-            set_alpha_to_one=False,
-        )
-
-        vae = self.dummy_vae
-        bert = self.dummy_text_encoder
-        tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
-
-        sd_pipe = StableDiffusionPipeline(
-            unet=unet,
-            scheduler=scheduler,
-            vae=vae,
-            text_encoder=bert,
-            tokenizer=tokenizer,
-            safety_checker=None,
-            feature_extractor=self.dummy_extractor,
-        )
-        sd_pipe = sd_pipe.to(device)
-        sd_pipe.set_progress_bar_config(disable=None)
-
-        prompt = "A painting of a squirrel eating a burger"
-
-        generator = torch.Generator(device=device).manual_seed(0)
-        output = sd_pipe([prompt], generator=generator, guidance_scale=6.0, num_inference_steps=2, output_type="np")
-        image = output.images
-
-        generator = torch.Generator(device=device).manual_seed(0)
-        image_from_tuple = sd_pipe(
-            [prompt],
-            generator=generator,
-            guidance_scale=6.0,
-            num_inference_steps=2,
-            output_type="np",
-            return_dict=False,
-        )[0]
-
-        image_slice = image[0, -3:, -3:, -1]
-        image_from_tuple_slice = image_from_tuple[0, -3:, -3:, -1]
-
-        assert image.shape == (1, 64, 64, 3)
-        expected_slice = np.array([0.5756, 0.6118, 0.5005, 0.5041, 0.5471, 0.4726, 0.4976, 0.4865, 0.4864])
-
-        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
-        assert np.abs(image_from_tuple_slice.flatten() - expected_slice).max() < 1e-2
-
-    def test_stable_diffusion_pndm(self):
-        device = "cpu"  # ensure determinism for the device-dependent torch.Generator
-        unet = self.dummy_cond_unet
-        scheduler = PNDMScheduler(skip_prk_steps=True)
-        vae = self.dummy_vae
-        bert = self.dummy_text_encoder
-        tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
-
-        # make sure here that pndm scheduler skips prk
-        sd_pipe = StableDiffusionPipeline(
-            unet=unet,
-            scheduler=scheduler,
-            vae=vae,
-            text_encoder=bert,
-            tokenizer=tokenizer,
-            safety_checker=None,
-            feature_extractor=self.dummy_extractor,
-        )
-        sd_pipe = sd_pipe.to(device)
-        sd_pipe.set_progress_bar_config(disable=None)
-
-        prompt = "A painting of a squirrel eating a burger"
-        generator = torch.Generator(device=device).manual_seed(0)
-        output = sd_pipe([prompt], generator=generator, guidance_scale=6.0, num_inference_steps=2, output_type="np")
-
-        image = output.images
-
-        generator = torch.Generator(device=device).manual_seed(0)
-        image_from_tuple = sd_pipe(
-            [prompt],
-            generator=generator,
-            guidance_scale=6.0,
-            num_inference_steps=2,
-            output_type="np",
-            return_dict=False,
-        )[0]
-
-        image_slice = image[0, -3:, -3:, -1]
-        image_from_tuple_slice = image_from_tuple[0, -3:, -3:, -1]
-
-        assert image.shape == (1, 64, 64, 3)
-        expected_slice = np.array([0.5125, 0.5716, 0.4828, 0.5060, 0.5650, 0.4768, 0.5185, 0.4895, 0.4993])
-
-        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
-        assert np.abs(image_from_tuple_slice.flatten() - expected_slice).max() < 1e-2
-
-    def test_stable_diffusion_no_safety_checker(self):
-        pipe = StableDiffusionPipeline.from_pretrained(
-            "hf-internal-testing/tiny-stable-diffusion-lms-pipe", safety_checker=None
-        )
-        assert isinstance(pipe, StableDiffusionPipeline)
-        assert isinstance(pipe.scheduler, LMSDiscreteScheduler)
-        assert pipe.safety_checker is None
-
-        image = pipe("example prompt", num_inference_steps=2).images[0]
-        assert image is not None
-
-        # check that there's no error when saving a pipeline with one of the models being None
-        with tempfile.TemporaryDirectory() as tmpdirname:
-            pipe.save_pretrained(tmpdirname)
-            pipe = StableDiffusionPipeline.from_pretrained(tmpdirname)
-
-        # sanity check that the pipeline still works
-        assert pipe.safety_checker is None
-        image = pipe("example prompt", num_inference_steps=2).images[0]
-        assert image is not None
-
-    @require_accelerator
-    def test_stable_diffusion_fp16(self):
-        """Test that stable diffusion works with fp16"""
-        unet = self.dummy_cond_unet
-        scheduler = PNDMScheduler(skip_prk_steps=True)
-        vae = self.dummy_vae
-        bert = self.dummy_text_encoder
-        tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
-
-        # put models in fp16
-        unet = unet.half()
-        vae = vae.half()
-        bert = bert.half()
-
-        # make sure here that pndm scheduler skips prk
-        sd_pipe = StableDiffusionPipeline(
-            unet=unet,
-            scheduler=scheduler,
-            vae=vae,
-            text_encoder=bert,
-            tokenizer=tokenizer,
-            safety_checker=None,
-            feature_extractor=self.dummy_extractor,
-        )
-        sd_pipe = sd_pipe.to(torch_device)
-        sd_pipe.set_progress_bar_config(disable=None)
-
-        prompt = "A painting of a squirrel eating a burger"
-        image = sd_pipe([prompt], num_inference_steps=2, output_type="np").images
-
-        assert image.shape == (1, 64, 64, 3)
-
-
-@nightly
-@require_torch_accelerator
-class SafeDiffusionPipelineIntegrationTests(unittest.TestCase):
-    def setUp(self):
-        # clean up the VRAM before each test
-        super().setUp()
-        gc.collect()
-        backend_empty_cache(torch_device)
-
-    def tearDown(self):
-        # clean up the VRAM after each test
-        super().tearDown()
-        gc.collect()
-        backend_empty_cache(torch_device)
-
-    def test_harm_safe_stable_diffusion(self):
-        sd_pipe = StableDiffusionPipeline.from_pretrained(
-            "stable-diffusion-v1-5/stable-diffusion-v1-5", safety_checker=None
-        )
-        sd_pipe.scheduler = LMSDiscreteScheduler.from_config(sd_pipe.scheduler.config)
-        sd_pipe = sd_pipe.to(torch_device)
-        sd_pipe.set_progress_bar_config(disable=None)
-
-        prompt = (
-            "portrait of girl with smokey eyes makeup in abandoned hotel, grange clothes, redshift, wide high angle"
-            " coloured polaroid photograph with flash, kodak film, hyper real, stunning moody cinematography, with"
-            " anamorphic lenses, by maripol, fallen angels by wong kar - wai, style of suspiria and neon demon and"
-            " children from bahnhof zoo, detailed "
-        )
-        seed = 4003660346
-        guidance_scale = 7
-
-        # without safety guidance (sld_guidance_scale = 0)
-        generator = torch.manual_seed(seed)
-        output = sd_pipe(
-            [prompt],
-            generator=generator,
-            guidance_scale=guidance_scale,
-            num_inference_steps=50,
-            output_type="np",
-            width=512,
-            height=512,
-            sld_guidance_scale=0,
-        )
-
-        image = output.images
-        image_slice = image[0, -3:, -3:, -1]
-        expected_slices = Expectations(
-            {
-                ("xpu", 3): [0.0076, 0.0058, 0.0012, 0, 0.0047, 0.0046, 0, 0, 0],
-                ("cuda", 7): [0.2278, 0.2231, 0.2249, 0.2333, 0.2303, 0.1885, 0.2273, 0.2144, 0.2176],
-                ("cuda", 8): [0.0076, 0.0058, 0.0012, 0, 0.0047, 0.0046, 0, 0, 0],
-            }
-        )
-        expected_slice = expected_slices.get_expectation()
-
-        assert image.shape == (1, 512, 512, 3)
-
-        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
-
-        # with safety guidance (strong configuration)
-        generator = torch.manual_seed(seed)
-        output = sd_pipe(
-            [prompt],
-            generator=generator,
-            guidance_scale=guidance_scale,
-            num_inference_steps=50,
-            output_type="np",
-            width=512,
-            height=512,
-            sld_guidance_scale=2000,
-            sld_warmup_steps=7,
-            sld_threshold=0.025,
-            sld_momentum_scale=0.5,
-            sld_mom_beta=0.7,
-        )
-
-        image = output.images
-        image_slice = image[0, -3:, -3:, -1]
-
-        expected_slices = Expectations(
-            {
-                ("xpu", 3): [0.0443, 0.0439, 0.0381, 0.0336, 0.0408, 0.0345, 0.0405, 0.0338, 0.0293],
-                ("cuda", 7): [0.2383, 0.2276, 0.236, 0.2192, 0.2186, 0.2053, 0.1971, 0.1901, 0.1719],
-                ("cuda", 8): [0.0443, 0.0439, 0.0381, 0.0336, 0.0408, 0.0345, 0.0405, 0.0338, 0.0293],
-            }
-        )
-        expected_slice = expected_slices.get_expectation()
-
-        assert image.shape == (1, 512, 512, 3)
-
-        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
-
-    def test_nudity_safe_stable_diffusion(self):
-        sd_pipe = StableDiffusionPipeline.from_pretrained(
-            "stable-diffusion-v1-5/stable-diffusion-v1-5", safety_checker=None
-        )
-        sd_pipe.scheduler = LMSDiscreteScheduler.from_config(sd_pipe.scheduler.config)
-        sd_pipe = sd_pipe.to(torch_device)
-        sd_pipe.set_progress_bar_config(disable=None)
-
-        prompt = "padme amidala taking a bath artwork, safe for work, no nudity"
-        seed = 2734971755
-        guidance_scale = 7
-
-        generator = torch.manual_seed(seed)
-        output = sd_pipe(
-            [prompt],
-            generator=generator,
-            guidance_scale=guidance_scale,
-            num_inference_steps=50,
-            output_type="np",
-            width=512,
-            height=512,
-            sld_guidance_scale=0,
-        )
-
-        image = output.images
-        image_slice = image[0, -3:, -3:, -1]
-        expected_slices = Expectations(
-            {
-                ("xpu", 3): [0.3244, 0.3355, 0.3260, 0.3123, 0.3246, 0.3426, 0.3109, 0.3471, 0.4001],
-                ("cuda", 7): [0.3502, 0.3622, 0.3396, 0.3642, 0.3478, 0.3318, 0.35, 0.3348, 0.3297],
-                ("cuda", 8): [0.3605, 0.3684, 0.3712, 0.3624, 0.3675, 0.3726, 0.3494, 0.3748, 0.4044],
-            }
-        )
-        expected_slice = expected_slices.get_expectation()
-        assert image.shape == (1, 512, 512, 3)
-
-        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
-
-        generator = torch.manual_seed(seed)
-        output = sd_pipe(
-            [prompt],
-            generator=generator,
-            guidance_scale=guidance_scale,
-            num_inference_steps=50,
-            output_type="np",
-            width=512,
-            height=512,
-            sld_guidance_scale=2000,
-            sld_warmup_steps=7,
-            sld_threshold=0.025,
-            sld_momentum_scale=0.5,
-            sld_mom_beta=0.7,
-        )
-
-        image = output.images
-        image_slice = image[0, -3:, -3:, -1]
-        expected_slices = Expectations(
-            {
-                ("xpu", 3): [0.6178, 0.6260, 0.6194, 0.6435, 0.6265, 0.6461, 0.6567, 0.6576, 0.6444],
-                ("cuda", 7): [0.5531, 0.5206, 0.4895, 0.5156, 0.5182, 0.4751, 0.4802, 0.4803, 0.4443],
-                ("cuda", 8): [0.5892, 0.5959, 0.5914, 0.6123, 0.5982, 0.6141, 0.6180, 0.6262, 0.6171],
-            }
-        )
-
-        expected_slice = expected_slices.get_expectation()
-
-        assert image.shape == (1, 512, 512, 3)
-
-        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
-
-    def test_nudity_safetychecker_safe_stable_diffusion(self):
-        sd_pipe = StableDiffusionPipeline.from_pretrained("stable-diffusion-v1-5/stable-diffusion-v1-5")
-        sd_pipe = sd_pipe.to(torch_device)
-        sd_pipe.set_progress_bar_config(disable=None)
-
-        prompt = (
-            "the four horsewomen of the apocalypse, painting by tom of finland, gaston bussiere, craig mullins, j. c."
- " leyendecker" - ) - seed = 1044355234 - guidance_scale = 12 - - generator = torch.manual_seed(seed) - output = sd_pipe( - [prompt], - generator=generator, - guidance_scale=guidance_scale, - num_inference_steps=50, - output_type="np", - width=512, - height=512, - sld_guidance_scale=0, - ) - - image = output.images - image_slice = image[0, -3:, -3:, -1] - expected_slice = np.array([0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]) - - assert image.shape == (1, 512, 512, 3) - - assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-7 - - generator = torch.manual_seed(seed) - output = sd_pipe( - [prompt], - generator=generator, - guidance_scale=guidance_scale, - num_inference_steps=50, - output_type="np", - width=512, - height=512, - sld_guidance_scale=2000, - sld_warmup_steps=7, - sld_threshold=0.025, - sld_momentum_scale=0.5, - sld_mom_beta=0.7, - ) - - image = output.images - image_slice = image[0, -3:, -3:, -1] - expected_slices = Expectations( - { - ("xpu", 3): np.array([0.0695, 0.1244, 0.1831, 0.0527, 0.0444, 0.1660, 0.0572, 0.0677, 0.1551]), - ("cuda", 7): np.array([0.5818, 0.6285, 0.6835, 0.6019, 0.625, 0.6754, 0.6096, 0.6334, 0.6561]), - ("cuda", 8): np.array([0.0695, 0.1244, 0.1831, 0.0527, 0.0444, 0.1660, 0.0572, 0.0677, 0.1551]), - } - ) - expected_slice = expected_slices.get_expectation() - - assert image.shape == (1, 512, 512, 3) - assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2 diff --git a/tests/pipelines/stable_diffusion_sag/__init__.py b/tests/pipelines/stable_diffusion_sag/__init__.py deleted file mode 100644 index e69de29bb2d1..000000000000 diff --git a/tests/pipelines/stable_diffusion_sag/test_stable_diffusion_sag.py b/tests/pipelines/stable_diffusion_sag/test_stable_diffusion_sag.py deleted file mode 100644 index 1d1840332236..000000000000 --- a/tests/pipelines/stable_diffusion_sag/test_stable_diffusion_sag.py +++ /dev/null @@ -1,245 +0,0 @@ -# coding=utf-8 -# Copyright 2025 HuggingFace Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import gc -import unittest - -import numpy as np -import torch -from transformers import CLIPTextConfig, CLIPTextModel, CLIPTokenizer - -from diffusers import ( - AutoencoderKL, - DDIMScheduler, - DEISMultistepScheduler, - DPMSolverMultistepScheduler, - EulerDiscreteScheduler, - StableDiffusionSAGPipeline, - UNet2DConditionModel, -) -from diffusers.utils.testing_utils import ( - backend_empty_cache, - enable_full_determinism, - nightly, - require_torch_accelerator, - torch_device, -) - -from ..pipeline_params import TEXT_TO_IMAGE_BATCH_PARAMS, TEXT_TO_IMAGE_IMAGE_PARAMS, TEXT_TO_IMAGE_PARAMS -from ..test_pipelines_common import ( - IPAdapterTesterMixin, - PipelineFromPipeTesterMixin, - PipelineLatentTesterMixin, - PipelineTesterMixin, -) - - -enable_full_determinism() - - -class StableDiffusionSAGPipelineFastTests( - IPAdapterTesterMixin, - PipelineLatentTesterMixin, - PipelineTesterMixin, - PipelineFromPipeTesterMixin, - unittest.TestCase, -): - pipeline_class = StableDiffusionSAGPipeline - params = TEXT_TO_IMAGE_PARAMS - batch_params = TEXT_TO_IMAGE_BATCH_PARAMS - image_params = TEXT_TO_IMAGE_IMAGE_PARAMS - image_latents_params = TEXT_TO_IMAGE_IMAGE_PARAMS - - def get_dummy_components(self): - torch.manual_seed(0) - unet = UNet2DConditionModel( - block_out_channels=(4, 8), - layers_per_block=2, - sample_size=8, - norm_num_groups=1, - in_channels=4, - out_channels=4, - down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"), - up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"), - cross_attention_dim=8, - ) - scheduler = DDIMScheduler( - beta_start=0.00085, - beta_end=0.012, - beta_schedule="scaled_linear", - clip_sample=False, - set_alpha_to_one=False, - ) - torch.manual_seed(0) - vae = AutoencoderKL( - block_out_channels=[4, 8], - norm_num_groups=1, - in_channels=3, - out_channels=3, - down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"], - up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"], - latent_channels=4, - ) - torch.manual_seed(0) - text_encoder_config = CLIPTextConfig( - bos_token_id=0, - eos_token_id=2, - hidden_size=8, - num_hidden_layers=2, - intermediate_size=37, - layer_norm_eps=1e-05, - num_attention_heads=4, - pad_token_id=1, - vocab_size=1000, - ) - text_encoder = CLIPTextModel(text_encoder_config) - tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip") - - components = { - "unet": unet, - "scheduler": scheduler, - "vae": vae, - "text_encoder": text_encoder, - "tokenizer": tokenizer, - "safety_checker": None, - "feature_extractor": None, - "image_encoder": None, - } - return components - - def get_dummy_inputs(self, device, seed=0): - if str(device).startswith("mps"): - generator = torch.manual_seed(seed) - else: - generator = torch.Generator(device=device).manual_seed(seed) - inputs = { - "prompt": ".", - "generator": generator, - "num_inference_steps": 2, - "guidance_scale": 1.0, - "sag_scale": 1.0, - "output_type": "np", - } - return inputs - - def test_inference_batch_single_identical(self): - super().test_inference_batch_single_identical(expected_max_diff=3e-3) - - @unittest.skip("Not necessary to test here.") - def test_xformers_attention_forwardGenerator_pass(self): - pass - - def test_pipeline_different_schedulers(self): - pipeline = self.pipeline_class(**self.get_dummy_components()) - inputs = self.get_dummy_inputs("cpu") - - expected_image_size = (16, 16, 3) - for scheduler_cls in [DDIMScheduler, DEISMultistepScheduler, DPMSolverMultistepScheduler]: - pipeline.scheduler = 
scheduler_cls.from_config(pipeline.scheduler.config) - image = pipeline(**inputs).images[0] - - shape = image.shape - assert shape == expected_image_size - - pipeline.scheduler = EulerDiscreteScheduler.from_config(pipeline.scheduler.config) - - with self.assertRaises(ValueError): - # Karras schedulers are not supported - image = pipeline(**inputs).images[0] - - def test_encode_prompt_works_in_isolation(self): - extra_required_param_value_dict = { - "device": torch.device(torch_device).type, - "do_classifier_free_guidance": self.get_dummy_inputs(device=torch_device).get("guidance_scale", 1.0) > 1.0, - } - return super().test_encode_prompt_works_in_isolation(extra_required_param_value_dict) - - -@nightly -@require_torch_accelerator -class StableDiffusionPipelineIntegrationTests(unittest.TestCase): - def setUp(self): - # clean up the VRAM before each test - super().setUp() - gc.collect() - backend_empty_cache(torch_device) - - def tearDown(self): - # clean up the VRAM after each test - super().tearDown() - gc.collect() - backend_empty_cache(torch_device) - - def test_stable_diffusion_1(self): - sag_pipe = StableDiffusionSAGPipeline.from_pretrained("CompVis/stable-diffusion-v1-4") - sag_pipe = sag_pipe.to(torch_device) - sag_pipe.set_progress_bar_config(disable=None) - - prompt = "." - generator = torch.manual_seed(0) - output = sag_pipe( - [prompt], generator=generator, guidance_scale=7.5, sag_scale=1.0, num_inference_steps=20, output_type="np" - ) - - image = output.images - - image_slice = image[0, -3:, -3:, -1] - - assert image.shape == (1, 512, 512, 3) - expected_slice = np.array([0.1568, 0.1738, 0.1695, 0.1693, 0.1507, 0.1705, 0.1547, 0.1751, 0.1949]) - - assert np.abs(image_slice.flatten() - expected_slice).max() < 5e-2 - - def test_stable_diffusion_2(self): - sag_pipe = StableDiffusionSAGPipeline.from_pretrained("stabilityai/stable-diffusion-2-1-base") - sag_pipe = sag_pipe.to(torch_device) - sag_pipe.set_progress_bar_config(disable=None) - - prompt = "." - generator = torch.manual_seed(0) - output = sag_pipe( - [prompt], generator=generator, guidance_scale=7.5, sag_scale=1.0, num_inference_steps=20, output_type="np" - ) - - image = output.images - - image_slice = image[0, -3:, -3:, -1] - - assert image.shape == (1, 512, 512, 3) - expected_slice = np.array([0.3459, 0.2876, 0.2537, 0.3002, 0.2671, 0.2160, 0.3026, 0.2262, 0.2371]) - - assert np.abs(image_slice.flatten() - expected_slice).max() < 5e-2 - - def test_stable_diffusion_2_non_square(self): - sag_pipe = StableDiffusionSAGPipeline.from_pretrained("stabilityai/stable-diffusion-2-1-base") - sag_pipe = sag_pipe.to(torch_device) - sag_pipe.set_progress_bar_config(disable=None) - - prompt = "." - generator = torch.manual_seed(0) - output = sag_pipe( - [prompt], - width=768, - height=512, - generator=generator, - guidance_scale=7.5, - sag_scale=1.0, - num_inference_steps=20, - output_type="np", - ) - - image = output.images - - assert image.shape == (1, 512, 768, 3) diff --git a/tests/pipelines/text_to_video_synthesis/__init__.py b/tests/pipelines/text_to_video_synthesis/__init__.py deleted file mode 100644 index e69de29bb2d1..000000000000 diff --git a/tests/pipelines/text_to_video_synthesis/test_text_to_video.py b/tests/pipelines/text_to_video_synthesis/test_text_to_video.py deleted file mode 100644 index 445f876985f2..000000000000 --- a/tests/pipelines/text_to_video_synthesis/test_text_to_video.py +++ /dev/null @@ -1,231 +0,0 @@ -# coding=utf-8 -# Copyright 2025 HuggingFace Inc. 
-# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import gc -import unittest - -import numpy as np -import torch -from transformers import CLIPTextConfig, CLIPTextModel, CLIPTokenizer - -from diffusers import AutoencoderKL, DDIMScheduler, TextToVideoSDPipeline, UNet3DConditionModel -from diffusers.utils import is_xformers_available -from diffusers.utils.testing_utils import ( - backend_empty_cache, - enable_full_determinism, - load_numpy, - numpy_cosine_similarity_distance, - require_torch_accelerator, - skip_mps, - slow, - torch_device, -) - -from ..pipeline_params import TEXT_TO_IMAGE_BATCH_PARAMS, TEXT_TO_IMAGE_PARAMS -from ..test_pipelines_common import PipelineTesterMixin, SDFunctionTesterMixin - - -enable_full_determinism() - - -@skip_mps -class TextToVideoSDPipelineFastTests(PipelineTesterMixin, SDFunctionTesterMixin, unittest.TestCase): - pipeline_class = TextToVideoSDPipeline - params = TEXT_TO_IMAGE_PARAMS - batch_params = TEXT_TO_IMAGE_BATCH_PARAMS - # No `output_type`. - required_optional_params = frozenset( - [ - "num_inference_steps", - "generator", - "latents", - "return_dict", - "callback", - "callback_steps", - ] - ) - - def get_dummy_components(self): - torch.manual_seed(0) - unet = UNet3DConditionModel( - block_out_channels=(8, 8), - layers_per_block=1, - sample_size=32, - in_channels=4, - out_channels=4, - down_block_types=("CrossAttnDownBlock3D", "DownBlock3D"), - up_block_types=("UpBlock3D", "CrossAttnUpBlock3D"), - cross_attention_dim=4, - attention_head_dim=4, - norm_num_groups=2, - ) - scheduler = DDIMScheduler( - beta_start=0.00085, - beta_end=0.012, - beta_schedule="scaled_linear", - clip_sample=False, - set_alpha_to_one=False, - ) - torch.manual_seed(0) - vae = AutoencoderKL( - block_out_channels=(8,), - in_channels=3, - out_channels=3, - down_block_types=["DownEncoderBlock2D"], - up_block_types=["UpDecoderBlock2D"], - latent_channels=4, - sample_size=32, - norm_num_groups=2, - ) - torch.manual_seed(0) - text_encoder_config = CLIPTextConfig( - bos_token_id=0, - eos_token_id=2, - hidden_size=4, - intermediate_size=16, - layer_norm_eps=1e-05, - num_attention_heads=2, - num_hidden_layers=2, - pad_token_id=1, - vocab_size=1000, - hidden_act="gelu", - projection_dim=32, - ) - text_encoder = CLIPTextModel(text_encoder_config) - tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip") - - components = { - "unet": unet, - "scheduler": scheduler, - "vae": vae, - "text_encoder": text_encoder, - "tokenizer": tokenizer, - } - return components - - def get_dummy_inputs(self, device, seed=0): - if str(device).startswith("mps"): - generator = torch.manual_seed(seed) - else: - generator = torch.Generator(device=device).manual_seed(seed) - inputs = { - "prompt": "A painting of a squirrel eating a burger", - "generator": generator, - "num_inference_steps": 2, - "guidance_scale": 6.0, - "output_type": "pt", - } - return inputs - - def test_dict_tuple_outputs_equivalent(self): - return super().test_dict_tuple_outputs_equivalent() - - def 
test_text_to_video_default_case(self): - device = "cpu" # ensure determinism for the device-dependent torch.Generator - components = self.get_dummy_components() - sd_pipe = TextToVideoSDPipeline(**components) - sd_pipe = sd_pipe.to(device) - sd_pipe.set_progress_bar_config(disable=None) - - inputs = self.get_dummy_inputs(device) - inputs["output_type"] = "np" - frames = sd_pipe(**inputs).frames - - image_slice = frames[0][0][-3:, -3:, -1] - assert frames[0][0].shape == (32, 32, 3) - expected_slice = np.array([0.8093, 0.2751, 0.6976, 0.5927, 0.4616, 0.4336, 0.5094, 0.5683, 0.4796]) - - assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2 - - @unittest.skipIf(torch_device != "cuda", reason="Feature isn't heavily used. Test in CUDA environment only.") - def test_attention_slicing_forward_pass(self): - self._test_attention_slicing_forward_pass(test_mean_pixel_difference=False, expected_max_diff=3e-3) - - @unittest.skipIf( - torch_device != "cuda" or not is_xformers_available(), - reason="XFormers attention is only available with CUDA and `xformers` installed", - ) - def test_xformers_attention_forwardGenerator_pass(self): - self._test_xformers_attention_forwardGenerator_pass(test_mean_pixel_difference=False, expected_max_diff=1e-2) - - # (todo): sayakpaul - @unittest.skip(reason="Batching needs to be properly figured out first for this pipeline.") - def test_inference_batch_consistent(self): - pass - - # (todo): sayakpaul - @unittest.skip(reason="Batching needs to be properly figured out first for this pipeline.") - def test_inference_batch_single_identical(self): - pass - - @unittest.skip(reason="`num_images_per_prompt` argument is not supported for this pipeline.") - def test_num_images_per_prompt(self): - pass - - def test_encode_prompt_works_in_isolation(self): - extra_required_param_value_dict = { - "device": torch.device(torch_device).type, - "num_images_per_prompt": 1, - "do_classifier_free_guidance": self.get_dummy_inputs(device=torch_device).get("guidance_scale", 1.0) > 1.0, - } - return super().test_encode_prompt_works_in_isolation(extra_required_param_value_dict) - - -@slow -@skip_mps -@require_torch_accelerator -class TextToVideoSDPipelineSlowTests(unittest.TestCase): - def setUp(self): - # clean up the VRAM before each test - super().setUp() - gc.collect() - backend_empty_cache(torch_device) - - def tearDown(self): - # clean up the VRAM after each test - super().tearDown() - gc.collect() - backend_empty_cache(torch_device) - - def test_two_step_model(self): - expected_video = load_numpy( - "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/text-to-video/video_2step.npy" - ) - - pipe = TextToVideoSDPipeline.from_pretrained("damo-vilab/text-to-video-ms-1.7b") - pipe = pipe.to(torch_device) - - prompt = "Spiderman is surfing" - generator = torch.Generator(device="cpu").manual_seed(0) - - video_frames = pipe(prompt, generator=generator, num_inference_steps=2, output_type="np").frames - assert numpy_cosine_similarity_distance(expected_video.flatten(), video_frames.flatten()) < 1e-4 - - def test_two_step_model_with_freeu(self): - expected_video = [] - - pipe = TextToVideoSDPipeline.from_pretrained("damo-vilab/text-to-video-ms-1.7b") - pipe = pipe.to(torch_device) - - prompt = "Spiderman is surfing" - generator = torch.Generator(device="cpu").manual_seed(0) - - pipe.enable_freeu(s1=0.9, s2=0.2, b1=1.2, b2=1.4) - video_frames = pipe(prompt, generator=generator, num_inference_steps=2, output_type="np").frames - video = video_frames[0, 0, -3:, 
-3:, -1].flatten() - - expected_video = [0.3643, 0.3455, 0.3831, 0.3923, 0.2978, 0.3247, 0.3278, 0.3201, 0.3475] - - assert np.abs(expected_video - video).mean() < 5e-2 diff --git a/tests/pipelines/text_to_video_synthesis/test_text_to_video_zero.py b/tests/pipelines/text_to_video_synthesis/test_text_to_video_zero.py deleted file mode 100644 index 8c29b27416c5..000000000000 --- a/tests/pipelines/text_to_video_synthesis/test_text_to_video_zero.py +++ /dev/null @@ -1,62 +0,0 @@ -# coding=utf-8 -# Copyright 2025 HuggingFace Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import gc -import unittest - -import torch - -from diffusers import DDIMScheduler, TextToVideoZeroPipeline -from diffusers.utils.testing_utils import ( - backend_empty_cache, - load_pt, - nightly, - require_torch_accelerator, - torch_device, -) - -from ..test_pipelines_common import assert_mean_pixel_difference - - -@nightly -@require_torch_accelerator -class TextToVideoZeroPipelineSlowTests(unittest.TestCase): - def setUp(self): - # clean up the VRAM before each test - super().setUp() - gc.collect() - backend_empty_cache(torch_device) - - def tearDown(self): - # clean up the VRAM after each test - super().tearDown() - gc.collect() - backend_empty_cache(torch_device) - - def test_full_model(self): - model_id = "stable-diffusion-v1-5/stable-diffusion-v1-5" - pipe = TextToVideoZeroPipeline.from_pretrained(model_id, torch_dtype=torch.float16).to(torch_device) - pipe.scheduler = DDIMScheduler.from_config(pipe.scheduler.config) - generator = torch.Generator(device="cpu").manual_seed(0) - - prompt = "A bear is playing a guitar on Times Square" - result = pipe(prompt=prompt, generator=generator).images - - expected_result = load_pt( - "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/text-to-video/A bear is playing a guitar on Times Square.pt", - weights_only=False, - ) - - assert_mean_pixel_difference(result, expected_result) diff --git a/tests/pipelines/text_to_video_synthesis/test_text_to_video_zero_sdxl.py b/tests/pipelines/text_to_video_synthesis/test_text_to_video_zero_sdxl.py deleted file mode 100644 index da60435d0dcb..000000000000 --- a/tests/pipelines/text_to_video_synthesis/test_text_to_video_zero_sdxl.py +++ /dev/null @@ -1,403 +0,0 @@ -# coding=utf-8 -# Copyright 2025 HuggingFace Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import gc -import inspect -import tempfile -import unittest - -import numpy as np -import torch -from transformers import CLIPTextConfig, CLIPTextModel, CLIPTextModelWithProjection, CLIPTokenizer - -from diffusers import AutoencoderKL, DDIMScheduler, TextToVideoZeroSDXLPipeline, UNet2DConditionModel -from diffusers.utils.testing_utils import ( - backend_empty_cache, - enable_full_determinism, - nightly, - require_accelerate_version_greater, - require_torch_accelerator, - torch_device, -) - -from ..pipeline_params import TEXT_TO_IMAGE_BATCH_PARAMS, TEXT_TO_IMAGE_IMAGE_PARAMS, TEXT_TO_IMAGE_PARAMS -from ..test_pipelines_common import PipelineFromPipeTesterMixin, PipelineTesterMixin - - -enable_full_determinism() - - -def to_np(tensor): - if isinstance(tensor, torch.Tensor): - tensor = tensor.detach().cpu().numpy() - - return tensor - - -class TextToVideoZeroSDXLPipelineFastTests(PipelineTesterMixin, PipelineFromPipeTesterMixin, unittest.TestCase): - pipeline_class = TextToVideoZeroSDXLPipeline - params = TEXT_TO_IMAGE_PARAMS - batch_params = TEXT_TO_IMAGE_BATCH_PARAMS - image_params = TEXT_TO_IMAGE_IMAGE_PARAMS - image_latents_params = TEXT_TO_IMAGE_IMAGE_PARAMS - generator_device = "cpu" - - def get_dummy_components(self, seed=0): - torch.manual_seed(seed) - unet = UNet2DConditionModel( - block_out_channels=(2, 4), - layers_per_block=2, - sample_size=2, - norm_num_groups=2, - in_channels=4, - out_channels=4, - down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"), - up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"), - # SD2-specific config below - attention_head_dim=(2, 4), - use_linear_projection=True, - addition_embed_type="text_time", - addition_time_embed_dim=8, - transformer_layers_per_block=(1, 2), - projection_class_embeddings_input_dim=80, # 6 * 8 + 32 - cross_attention_dim=64, - ) - scheduler = DDIMScheduler( - num_train_timesteps=1000, - beta_start=0.0001, - beta_end=0.02, - beta_schedule="linear", - trained_betas=None, - clip_sample=True, - set_alpha_to_one=True, - steps_offset=0, - prediction_type="epsilon", - thresholding=False, - dynamic_thresholding_ratio=0.995, - clip_sample_range=1.0, - sample_max_value=1.0, - timestep_spacing="leading", - rescale_betas_zero_snr=False, - ) - torch.manual_seed(seed) - vae = AutoencoderKL( - block_out_channels=[32, 64], - in_channels=3, - out_channels=3, - down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"], - up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"], - latent_channels=4, - sample_size=128, - ) - torch.manual_seed(seed) - text_encoder_config = CLIPTextConfig( - bos_token_id=0, - eos_token_id=2, - hidden_size=32, - intermediate_size=37, - layer_norm_eps=1e-05, - num_attention_heads=4, - num_hidden_layers=5, - pad_token_id=1, - vocab_size=1000, - # SD2-specific config below - hidden_act="gelu", - projection_dim=32, - ) - text_encoder = CLIPTextModel(text_encoder_config) - tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip") - - text_encoder_2 = CLIPTextModelWithProjection(text_encoder_config) - tokenizer_2 = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip") - - components = { - "unet": unet, - "scheduler": scheduler, - "vae": vae, - "text_encoder": text_encoder, - "tokenizer": tokenizer, - "text_encoder_2": text_encoder_2, - "tokenizer_2": tokenizer_2, - "image_encoder": None, - "feature_extractor": None, - } - return components - - def get_dummy_inputs(self, device, seed=0): - if str(device).startswith("mps"): - generator = torch.manual_seed(seed) - else: - 
generator = torch.Generator(device=device).manual_seed(seed) - inputs = { - "prompt": "A panda dancing in Antarctica", - "generator": generator, - "num_inference_steps": 5, - "t0": 1, - "t1": 3, - "height": 64, - "width": 64, - "video_length": 3, - "output_type": "np", - } - return inputs - - def get_generator(self, device, seed=0): - if str(device).startswith("mps"): - generator = torch.manual_seed(seed) - else: - generator = torch.Generator(device=device).manual_seed(seed) - return generator - - def test_text_to_video_zero_sdxl(self): - components = self.get_dummy_components() - pipe = self.pipeline_class(**components) - pipe = pipe.to(torch_device) - - inputs = self.get_dummy_inputs(self.generator_device) - result = pipe(**inputs).images - - first_frame_slice = result[0, -3:, -3:, -1] - last_frame_slice = result[-1, -3:, -3:, 0] - - expected_slice1 = np.array( - [0.6008109, 0.73051643, 0.51778656, 0.55817354, 0.45222935, 0.45998418, 0.57017255, 0.54874814, 0.47078788] - ) - expected_slice2 = np.array( - [0.6011751, 0.47420046, 0.41660714, 0.6472957, 0.41261768, 0.5438129, 0.7401535, 0.6756011, 0.53652245] - ) - - assert np.abs(first_frame_slice.flatten() - expected_slice1).max() < 1e-2 - assert np.abs(last_frame_slice.flatten() - expected_slice2).max() < 1e-2 - - @unittest.skip( - reason="Cannot call `set_default_attn_processor` as this pipeline uses a specific attention processor." - ) - def test_attention_slicing_forward_pass(self): - pass - - def test_cfg(self): - sig = inspect.signature(self.pipeline_class.__call__) - if "guidance_scale" not in sig.parameters: - return - components = self.get_dummy_components() - - pipe = self.pipeline_class(**components) - pipe = pipe.to(torch_device) - pipe.set_progress_bar_config(disable=None) - - inputs = self.get_dummy_inputs(self.generator_device) - - inputs["guidance_scale"] = 1.0 - out_no_cfg = pipe(**inputs)[0] - - inputs["guidance_scale"] = 7.5 - out_cfg = pipe(**inputs)[0] - - assert out_cfg.shape == out_no_cfg.shape - - def test_dict_tuple_outputs_equivalent(self, expected_max_difference=1e-4): - components = self.get_dummy_components() - pipe = self.pipeline_class(**components) - pipe.to(torch_device) - pipe.set_progress_bar_config(disable=None) - - output = pipe(**self.get_dummy_inputs(self.generator_device))[0] - output_tuple = pipe(**self.get_dummy_inputs(self.generator_device), return_dict=False)[0] - - max_diff = np.abs(to_np(output) - to_np(output_tuple)).max() - self.assertLess(max_diff, expected_max_difference) - - @unittest.skipIf(torch_device not in ["cuda", "xpu"], reason="float16 requires CUDA or XPU") - @require_torch_accelerator - def test_float16_inference(self, expected_max_diff=5e-2): - components = self.get_dummy_components() - for name, module in components.items(): - if hasattr(module, "half"): - components[name] = module.to(torch_device).half() - pipe = self.pipeline_class(**components) - pipe.to(torch_device) - pipe.set_progress_bar_config(disable=None) - - components = self.get_dummy_components() - pipe_fp16 = self.pipeline_class(**components) - pipe_fp16.to(torch_device, torch.float16) - pipe_fp16.set_progress_bar_config(disable=None) - - inputs = self.get_dummy_inputs(self.generator_device) - # Reset generator in case it is used inside dummy inputs - if "generator" in inputs: - inputs["generator"] = self.get_generator(self.generator_device) - - output = pipe(**inputs)[0] - - fp16_inputs = self.get_dummy_inputs(self.generator_device) - # Reset generator in case it is used inside dummy inputs - if "generator" in 
fp16_inputs: - fp16_inputs["generator"] = self.get_generator(self.generator_device) - - output_fp16 = pipe_fp16(**fp16_inputs)[0] - - max_diff = np.abs(to_np(output) - to_np(output_fp16)).max() - self.assertLess(max_diff, expected_max_diff, "The outputs of the fp16 and fp32 pipelines are too different.") - - @unittest.skip(reason="Batching needs to be properly figured out first for this pipeline.") - def test_inference_batch_consistent(self): - pass - - @unittest.skip( - reason="Cannot call `set_default_attn_processor` as this pipeline uses a specific attention processor." - ) - def test_inference_batch_single_identical(self): - pass - - @require_torch_accelerator - @require_accelerate_version_greater("0.17.0") - def test_model_cpu_offload_forward_pass(self, expected_max_diff=2e-4): - components = self.get_dummy_components() - pipe = self.pipeline_class(**components) - pipe = pipe.to(torch_device) - pipe.set_progress_bar_config(disable=None) - - inputs = self.get_dummy_inputs(self.generator_device) - output_without_offload = pipe(**inputs)[0] - - pipe.enable_model_cpu_offload(device=torch_device) - inputs = self.get_dummy_inputs(self.generator_device) - output_with_offload = pipe(**inputs)[0] - - max_diff = np.abs(to_np(output_with_offload) - to_np(output_without_offload)).max() - self.assertLess(max_diff, expected_max_diff, "CPU offloading should not affect the inference results") - - @unittest.skip(reason="`num_images_per_prompt` argument is not supported for this pipeline.") - def test_pipeline_call_signature(self): - pass - - @unittest.skipIf(torch_device not in ["cuda", "xpu"], reason="float16 requires CUDA or XPU") - @require_torch_accelerator - def test_save_load_float16(self, expected_max_diff=1e-2): - components = self.get_dummy_components() - for name, module in components.items(): - if hasattr(module, "half"): - components[name] = module.to(torch_device).half() - - pipe = self.pipeline_class(**components) - pipe.to(torch_device) - pipe.set_progress_bar_config(disable=None) - - inputs = self.get_dummy_inputs(self.generator_device) - output = pipe(**inputs)[0] - - with tempfile.TemporaryDirectory() as tmpdir: - pipe.save_pretrained(tmpdir) - pipe_loaded = self.pipeline_class.from_pretrained(tmpdir, torch_dtype=torch.float16) - pipe_loaded.to(torch_device) - pipe_loaded.set_progress_bar_config(disable=None) - - for name, component in pipe_loaded.components.items(): - if hasattr(component, "dtype"): - self.assertTrue( - component.dtype == torch.float16, - f"`{name}.dtype` switched from `float16` to {component.dtype} after loading.", - ) - - inputs = self.get_dummy_inputs(self.generator_device) - output_loaded = pipe_loaded(**inputs)[0] - max_diff = np.abs(to_np(output) - to_np(output_loaded)).max() - self.assertLess( - max_diff, expected_max_diff, "The output of the fp16 pipeline changed after saving and loading." - ) - - @unittest.skip( - reason="Cannot call `set_default_attn_processor` as this pipeline uses a specific attention processor." - ) - def test_save_load_local(self): - pass - - @unittest.skip( - reason="Cannot call `set_default_attn_processor` as this pipeline uses a specific attention processor." - ) - def test_save_load_optional_components(self): - pass - - @unittest.skip( - reason="Cannot call `set_default_attn_processor` as this pipeline uses a specific attention processor." 
- ) - def test_sequential_cpu_offload_forward_pass(self): - pass - - @require_torch_accelerator - def test_to_device(self): - components = self.get_dummy_components() - pipe = self.pipeline_class(**components) - pipe.set_progress_bar_config(disable=None) - - pipe.to("cpu") - model_devices = [component.device.type for component in components.values() if hasattr(component, "device")] - self.assertTrue(all(device == "cpu" for device in model_devices)) - - output_cpu = pipe(**self.get_dummy_inputs("cpu"))[0] # generator set to cpu - self.assertTrue(np.isnan(output_cpu).sum() == 0) - - pipe.to(torch_device) - model_devices = [component.device.type for component in components.values() if hasattr(component, "device")] - self.assertTrue(all(device == torch_device for device in model_devices)) - - output_device = pipe(**self.get_dummy_inputs("cpu"))[0] # generator set to cpu - self.assertTrue(np.isnan(to_np(output_device)).sum() == 0) - - @unittest.skip( - reason="Cannot call `set_default_attn_processor` as this pipeline uses a specific attention processor." - ) - def test_xformers_attention_forwardGenerator_pass(self): - pass - - -@nightly -@require_torch_accelerator -class TextToVideoZeroSDXLPipelineSlowTests(unittest.TestCase): - def setUp(self): - # clean up the VRAM before each test - super().setUp() - gc.collect() - backend_empty_cache(torch_device) - - def tearDown(self): - # clean up the VRAM after each test - super().tearDown() - gc.collect() - backend_empty_cache(torch_device) - - def test_full_model(self): - model_id = "stabilityai/stable-diffusion-xl-base-1.0" - pipe = TextToVideoZeroSDXLPipeline.from_pretrained( - model_id, torch_dtype=torch.float16, variant="fp16", use_safetensors=True - ) - pipe.enable_model_cpu_offload() - pipe.enable_vae_slicing() - - pipe.scheduler = DDIMScheduler.from_config(pipe.scheduler.config) - generator = torch.Generator(device="cpu").manual_seed(0) - - prompt = "A panda dancing in Antarctica" - result = pipe(prompt=prompt, generator=generator).images - - first_frame_slice = result[0, -3:, -3:, -1] - last_frame_slice = result[-1, -3:, -3:, 0] - - expected_slice1 = np.array([0.57, 0.57, 0.57, 0.57, 0.57, 0.56, 0.55, 0.56, 0.56]) - expected_slice2 = np.array([0.54, 0.53, 0.53, 0.53, 0.53, 0.52, 0.53, 0.53, 0.53]) - - assert np.abs(first_frame_slice.flatten() - expected_slice1).max() < 1e-2 - assert np.abs(last_frame_slice.flatten() - expected_slice2).max() < 1e-2 diff --git a/tests/pipelines/text_to_video_synthesis/test_video_to_video.py b/tests/pipelines/text_to_video_synthesis/test_video_to_video.py deleted file mode 100644 index 2efef3d640ae..000000000000 --- a/tests/pipelines/text_to_video_synthesis/test_video_to_video.py +++ /dev/null @@ -1,229 +0,0 @@ -# coding=utf-8 -# Copyright 2025 HuggingFace Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
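The slow SDXL test above relies on two diffusers memory levers rather than moving the whole fp16 model onto the accelerator at once: `enable_model_cpu_offload()` keeps only the sub-model that is currently executing on the device, and `enable_vae_slicing()` decodes the latent batch one slice at a time to cap peak memory. A condensed sketch of that setup, using only calls that appear in the test itself (the standalone framing is the only addition):

import torch
from diffusers import DDIMScheduler, TextToVideoZeroSDXLPipeline

pipe = TextToVideoZeroSDXLPipeline.from_pretrained(
    "stabilityai/stable-diffusion-xl-base-1.0",
    torch_dtype=torch.float16,
    variant="fp16",
    use_safetensors=True,
)
pipe.enable_model_cpu_offload()  # shuttle sub-models to the accelerator only while they run
pipe.enable_vae_slicing()  # decode latents slice-by-slice to reduce peak memory
pipe.scheduler = DDIMScheduler.from_config(pipe.scheduler.config)

result = pipe(
    prompt="A panda dancing in Antarctica",
    generator=torch.Generator(device="cpu").manual_seed(0),
).images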
- -import random -import unittest - -import numpy as np -import torch -from transformers import CLIPTextConfig, CLIPTextModel, CLIPTokenizer - -from diffusers import ( - AutoencoderKL, - DDIMScheduler, - UNet3DConditionModel, - VideoToVideoSDPipeline, -) -from diffusers.utils import is_xformers_available -from diffusers.utils.testing_utils import ( - enable_full_determinism, - floats_tensor, - is_flaky, - nightly, - numpy_cosine_similarity_distance, - skip_mps, - torch_device, -) - -from ..pipeline_params import ( - TEXT_GUIDED_IMAGE_VARIATION_BATCH_PARAMS, - TEXT_GUIDED_IMAGE_VARIATION_PARAMS, -) -from ..test_pipelines_common import PipelineTesterMixin - - -enable_full_determinism() - - -@skip_mps -class VideoToVideoSDPipelineFastTests(PipelineTesterMixin, unittest.TestCase): - pipeline_class = VideoToVideoSDPipeline - params = TEXT_GUIDED_IMAGE_VARIATION_PARAMS.union({"video"}) - {"image", "width", "height"} - batch_params = TEXT_GUIDED_IMAGE_VARIATION_BATCH_PARAMS.union({"video"}) - {"image"} - test_attention_slicing = False - - # Overrides the mixin defaults; no `output_type`. - required_optional_params = frozenset( - [ - "num_inference_steps", - "generator", - "latents", - "return_dict", - "callback", - "callback_steps", - ] - ) - - def get_dummy_components(self): - torch.manual_seed(0) - unet = UNet3DConditionModel( - block_out_channels=(4, 8), - layers_per_block=1, - sample_size=32, - in_channels=4, - out_channels=4, - down_block_types=("CrossAttnDownBlock3D", "DownBlock3D"), - up_block_types=("UpBlock3D", "CrossAttnUpBlock3D"), - cross_attention_dim=32, - attention_head_dim=4, - norm_num_groups=2, - ) - scheduler = DDIMScheduler( - beta_start=0.00085, - beta_end=0.012, - beta_schedule="scaled_linear", - clip_sample=True, - set_alpha_to_one=False, - ) - torch.manual_seed(0) - vae = AutoencoderKL( - block_out_channels=[ - 8, - ], - in_channels=3, - out_channels=3, - down_block_types=[ - "DownEncoderBlock2D", - ], - up_block_types=["UpDecoderBlock2D"], - latent_channels=4, - sample_size=32, - norm_num_groups=2, - ) - torch.manual_seed(0) - text_encoder_config = CLIPTextConfig( - bos_token_id=0, - eos_token_id=2, - hidden_size=32, - intermediate_size=37, - layer_norm_eps=1e-05, - num_attention_heads=4, - num_hidden_layers=5, - pad_token_id=1, - vocab_size=1000, - hidden_act="gelu", - projection_dim=512, - ) - text_encoder = CLIPTextModel(text_encoder_config) - tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip") - - components = { - "unet": unet, - "scheduler": scheduler, - "vae": vae, - "text_encoder": text_encoder, - "tokenizer": tokenizer, - } - return components - - def get_dummy_inputs(self, device, seed=0): - # 3 frames - video = floats_tensor((1, 3, 3, 32, 32), rng=random.Random(seed)).to(device) - - if str(device).startswith("mps"): - generator = torch.manual_seed(seed) - else: - generator = torch.Generator(device=device).manual_seed(seed) - inputs = { - "prompt": "A painting of a squirrel eating a burger", - "video": video, - "generator": generator, - "num_inference_steps": 2, - "guidance_scale": 6.0, - "output_type": "pt", - } - return inputs - - def test_video_to_video_default_case(self): - device = "cpu" # ensure determinism for the device-dependent torch.Generator - components = self.get_dummy_components() - sd_pipe = VideoToVideoSDPipeline(**components) - sd_pipe = sd_pipe.to(device) - sd_pipe.set_progress_bar_config(disable=None) - - inputs = self.get_dummy_inputs(device) - 
inputs["output_type"] = "np" - frames = sd_pipe(**inputs).frames - image_slice = frames[0][0][-3:, -3:, -1] - - assert frames[0][0].shape == (32, 32, 3) - expected_slice = np.array([0.6391, 0.5350, 0.5202, 0.5521, 0.5453, 0.5393, 0.6652, 0.5270, 0.5185]) - - assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2 - - @is_flaky() - def test_save_load_optional_components(self): - super().test_save_load_optional_components(expected_max_difference=0.001) - - @is_flaky() - def test_dict_tuple_outputs_equivalent(self): - super().test_dict_tuple_outputs_equivalent() - - @is_flaky() - def test_save_load_local(self): - super().test_save_load_local() - - @unittest.skipIf( - torch_device != "cuda" or not is_xformers_available(), - reason="XFormers attention is only available with CUDA and `xformers` installed", - ) - def test_xformers_attention_forwardGenerator_pass(self): - self._test_xformers_attention_forwardGenerator_pass(test_mean_pixel_difference=False, expected_max_diff=5e-3) - - # (todo): sayakpaul - @unittest.skip(reason="Batching needs to be properly figured out first for this pipeline.") - def test_inference_batch_consistent(self): - pass - - # (todo): sayakpaul - @unittest.skip(reason="Batching needs to be properly figured out first for this pipeline.") - def test_inference_batch_single_identical(self): - pass - - @unittest.skip(reason="`num_images_per_prompt` argument is not supported for this pipeline.") - def test_num_images_per_prompt(self): - pass - - def test_encode_prompt_works_in_isolation(self): - extra_required_param_value_dict = { - "device": torch.device(torch_device).type, - "num_images_per_prompt": 1, - "do_classifier_free_guidance": self.get_dummy_inputs(device=torch_device).get("guidance_scale", 1.0) > 1.0, - } - return super().test_encode_prompt_works_in_isolation(extra_required_param_value_dict) - - -@nightly -@skip_mps -class VideoToVideoSDPipelineSlowTests(unittest.TestCase): - def test_two_step_model(self): - pipe = VideoToVideoSDPipeline.from_pretrained("cerspense/zeroscope_v2_576w", torch_dtype=torch.float16) - pipe.enable_model_cpu_offload() - - # 10 frames - generator = torch.Generator(device="cpu").manual_seed(0) - video = torch.randn((1, 10, 3, 320, 576), generator=generator) - - prompt = "Spiderman is surfing" - - generator = torch.Generator(device="cpu").manual_seed(0) - video_frames = pipe(prompt, video=video, generator=generator, num_inference_steps=3, output_type="np").frames - - expected_array = np.array( - [0.17114258, 0.13720703, 0.08886719, 0.14819336, 0.1730957, 0.24584961, 0.22021484, 0.35180664, 0.2607422] - ) - output_array = video_frames[0, 0, :3, :3, 0].flatten() - assert numpy_cosine_similarity_distance(expected_array, output_array) < 1e-3 diff --git a/tests/pipelines/unclip/__init__.py b/tests/pipelines/unclip/__init__.py deleted file mode 100644 index e69de29bb2d1..000000000000 diff --git a/tests/pipelines/unclip/test_unclip.py b/tests/pipelines/unclip/test_unclip.py deleted file mode 100644 index 4a970a4f6f6e..000000000000 --- a/tests/pipelines/unclip/test_unclip.py +++ /dev/null @@ -1,523 +0,0 @@ -# coding=utf-8 -# Copyright 2025 HuggingFace Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import gc -import unittest - -import numpy as np -import torch -from transformers import CLIPTextConfig, CLIPTextModelWithProjection, CLIPTokenizer - -from diffusers import PriorTransformer, UnCLIPPipeline, UnCLIPScheduler, UNet2DConditionModel, UNet2DModel -from diffusers.pipelines.unclip.text_proj import UnCLIPTextProjModel -from diffusers.utils.testing_utils import ( - backend_empty_cache, - backend_max_memory_allocated, - backend_reset_max_memory_allocated, - backend_reset_peak_memory_stats, - enable_full_determinism, - load_numpy, - nightly, - require_torch_accelerator, - skip_mps, - torch_device, -) - -from ..pipeline_params import TEXT_TO_IMAGE_BATCH_PARAMS, TEXT_TO_IMAGE_PARAMS -from ..test_pipelines_common import PipelineTesterMixin, assert_mean_pixel_difference - - -enable_full_determinism() - - -class UnCLIPPipelineFastTests(PipelineTesterMixin, unittest.TestCase): - pipeline_class = UnCLIPPipeline - params = TEXT_TO_IMAGE_PARAMS - { - "negative_prompt", - "height", - "width", - "negative_prompt_embeds", - "guidance_scale", - "prompt_embeds", - "cross_attention_kwargs", - } - batch_params = TEXT_TO_IMAGE_BATCH_PARAMS - required_optional_params = [ - "generator", - "return_dict", - "prior_num_inference_steps", - "decoder_num_inference_steps", - "super_res_num_inference_steps", - ] - test_xformers_attention = False - - @property - def text_embedder_hidden_size(self): - return 32 - - @property - def time_input_dim(self): - return 32 - - @property - def block_out_channels_0(self): - return self.time_input_dim - - @property - def time_embed_dim(self): - return self.time_input_dim * 4 - - @property - def cross_attention_dim(self): - return 100 - - @property - def dummy_tokenizer(self): - tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip") - return tokenizer - - @property - def dummy_text_encoder(self): - torch.manual_seed(0) - config = CLIPTextConfig( - bos_token_id=0, - eos_token_id=2, - hidden_size=self.text_embedder_hidden_size, - projection_dim=self.text_embedder_hidden_size, - intermediate_size=37, - layer_norm_eps=1e-05, - num_attention_heads=4, - num_hidden_layers=5, - pad_token_id=1, - vocab_size=1000, - ) - return CLIPTextModelWithProjection(config) - - @property - def dummy_prior(self): - torch.manual_seed(0) - - model_kwargs = { - "num_attention_heads": 2, - "attention_head_dim": 12, - "embedding_dim": self.text_embedder_hidden_size, - "num_layers": 1, - } - - model = PriorTransformer(**model_kwargs) - return model - - @property - def dummy_text_proj(self): - torch.manual_seed(0) - - model_kwargs = { - "clip_embeddings_dim": self.text_embedder_hidden_size, - "time_embed_dim": self.time_embed_dim, - "cross_attention_dim": self.cross_attention_dim, - } - - model = UnCLIPTextProjModel(**model_kwargs) - return model - - @property - def dummy_decoder(self): - torch.manual_seed(0) - - model_kwargs = { - "sample_size": 32, - # RGB in channels - "in_channels": 3, - # Out channels is double in channels because predicts mean and variance - "out_channels": 6, - "down_block_types": ("ResnetDownsampleBlock2D", "SimpleCrossAttnDownBlock2D"), - 
"up_block_types": ("SimpleCrossAttnUpBlock2D", "ResnetUpsampleBlock2D"), - "mid_block_type": "UNetMidBlock2DSimpleCrossAttn", - "block_out_channels": (self.block_out_channels_0, self.block_out_channels_0 * 2), - "layers_per_block": 1, - "cross_attention_dim": self.cross_attention_dim, - "attention_head_dim": 4, - "resnet_time_scale_shift": "scale_shift", - "class_embed_type": "identity", - } - - model = UNet2DConditionModel(**model_kwargs) - return model - - @property - def dummy_super_res_kwargs(self): - return { - "sample_size": 64, - "layers_per_block": 1, - "down_block_types": ("ResnetDownsampleBlock2D", "ResnetDownsampleBlock2D"), - "up_block_types": ("ResnetUpsampleBlock2D", "ResnetUpsampleBlock2D"), - "block_out_channels": (self.block_out_channels_0, self.block_out_channels_0 * 2), - "in_channels": 6, - "out_channels": 3, - } - - @property - def dummy_super_res_first(self): - torch.manual_seed(0) - - model = UNet2DModel(**self.dummy_super_res_kwargs) - return model - - @property - def dummy_super_res_last(self): - # seeded differently to get different unet than `self.dummy_super_res_first` - torch.manual_seed(1) - - model = UNet2DModel(**self.dummy_super_res_kwargs) - return model - - def get_dummy_components(self): - prior = self.dummy_prior - decoder = self.dummy_decoder - text_proj = self.dummy_text_proj - text_encoder = self.dummy_text_encoder - tokenizer = self.dummy_tokenizer - super_res_first = self.dummy_super_res_first - super_res_last = self.dummy_super_res_last - - prior_scheduler = UnCLIPScheduler( - variance_type="fixed_small_log", - prediction_type="sample", - num_train_timesteps=1000, - clip_sample_range=5.0, - ) - - decoder_scheduler = UnCLIPScheduler( - variance_type="learned_range", - prediction_type="epsilon", - num_train_timesteps=1000, - ) - - super_res_scheduler = UnCLIPScheduler( - variance_type="fixed_small_log", - prediction_type="epsilon", - num_train_timesteps=1000, - ) - - components = { - "prior": prior, - "decoder": decoder, - "text_proj": text_proj, - "text_encoder": text_encoder, - "tokenizer": tokenizer, - "super_res_first": super_res_first, - "super_res_last": super_res_last, - "prior_scheduler": prior_scheduler, - "decoder_scheduler": decoder_scheduler, - "super_res_scheduler": super_res_scheduler, - } - - return components - - def get_dummy_inputs(self, device, seed=0): - if str(device).startswith("mps"): - generator = torch.manual_seed(seed) - else: - generator = torch.Generator(device=device).manual_seed(seed) - inputs = { - "prompt": "horse", - "generator": generator, - "prior_num_inference_steps": 2, - "decoder_num_inference_steps": 2, - "super_res_num_inference_steps": 2, - "output_type": "np", - } - return inputs - - def test_unclip(self): - device = "cpu" - - components = self.get_dummy_components() - - pipe = self.pipeline_class(**components) - pipe = pipe.to(device) - - pipe.set_progress_bar_config(disable=None) - - output = pipe(**self.get_dummy_inputs(device)) - image = output.images - - image_from_tuple = pipe( - **self.get_dummy_inputs(device), - return_dict=False, - )[0] - - image_slice = image[0, -3:, -3:, -1] - image_from_tuple_slice = image_from_tuple[0, -3:, -3:, -1] - - assert image.shape == (1, 64, 64, 3) - - expected_slice = np.array( - [ - 0.9997, - 0.9988, - 0.0028, - 0.9997, - 0.9984, - 0.9965, - 0.0029, - 0.9986, - 0.0025, - ] - ) - - assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2 - assert np.abs(image_from_tuple_slice.flatten() - expected_slice).max() < 1e-2 - - def test_unclip_passed_text_embed(self): 
- device = torch.device("cpu") - - class DummyScheduler: - init_noise_sigma = 1 - - components = self.get_dummy_components() - - pipe = self.pipeline_class(**components) - pipe = pipe.to(device) - - prior = components["prior"] - decoder = components["decoder"] - super_res_first = components["super_res_first"] - tokenizer = components["tokenizer"] - text_encoder = components["text_encoder"] - - generator = torch.Generator(device=device).manual_seed(0) - dtype = prior.dtype - batch_size = 1 - - shape = (batch_size, prior.config.embedding_dim) - prior_latents = pipe.prepare_latents( - shape, dtype=dtype, device=device, generator=generator, latents=None, scheduler=DummyScheduler() - ) - shape = (batch_size, decoder.config.in_channels, decoder.config.sample_size, decoder.config.sample_size) - generator = torch.Generator(device=device).manual_seed(0) - decoder_latents = pipe.prepare_latents( - shape, dtype=dtype, device=device, generator=generator, latents=None, scheduler=DummyScheduler() - ) - - shape = ( - batch_size, - super_res_first.config.in_channels // 2, - super_res_first.config.sample_size, - super_res_first.config.sample_size, - ) - super_res_latents = pipe.prepare_latents( - shape, dtype=dtype, device=device, generator=generator, latents=None, scheduler=DummyScheduler() - ) - - pipe.set_progress_bar_config(disable=None) - - prompt = "this is a prompt example" - - generator = torch.Generator(device=device).manual_seed(0) - output = pipe( - [prompt], - generator=generator, - prior_num_inference_steps=2, - decoder_num_inference_steps=2, - super_res_num_inference_steps=2, - prior_latents=prior_latents, - decoder_latents=decoder_latents, - super_res_latents=super_res_latents, - output_type="np", - ) - image = output.images - - text_inputs = tokenizer( - prompt, - padding="max_length", - max_length=tokenizer.model_max_length, - return_tensors="pt", - ) - text_model_output = text_encoder(text_inputs.input_ids) - text_attention_mask = text_inputs.attention_mask - - generator = torch.Generator(device=device).manual_seed(0) - image_from_text = pipe( - generator=generator, - prior_num_inference_steps=2, - decoder_num_inference_steps=2, - super_res_num_inference_steps=2, - prior_latents=prior_latents, - decoder_latents=decoder_latents, - super_res_latents=super_res_latents, - text_model_output=text_model_output, - text_attention_mask=text_attention_mask, - output_type="np", - )[0] - - # make sure passing text embeddings manually is identical - assert np.abs(image - image_from_text).max() < 1e-4 - - # Overriding PipelineTesterMixin::test_attention_slicing_forward_pass - # because UnCLIP GPU undeterminism requires a looser check. - @skip_mps - def test_attention_slicing_forward_pass(self): - test_max_difference = torch_device == "cpu" - - self._test_attention_slicing_forward_pass(test_max_difference=test_max_difference, expected_max_diff=0.01) - - # Overriding PipelineTesterMixin::test_inference_batch_single_identical - # because UnCLIP undeterminism requires a looser check. 
- @skip_mps - def test_inference_batch_single_identical(self): - additional_params_copy_to_batched_inputs = [ - "prior_num_inference_steps", - "decoder_num_inference_steps", - "super_res_num_inference_steps", - ] - - self._test_inference_batch_single_identical( - additional_params_copy_to_batched_inputs=additional_params_copy_to_batched_inputs, expected_max_diff=9.8e-3 - ) - - def test_inference_batch_consistent(self): - additional_params_copy_to_batched_inputs = [ - "prior_num_inference_steps", - "decoder_num_inference_steps", - "super_res_num_inference_steps", - ] - - if torch_device == "mps": - # TODO: MPS errors with larger batch sizes - batch_sizes = [2, 3] - self._test_inference_batch_consistent( - batch_sizes=batch_sizes, - additional_params_copy_to_batched_inputs=additional_params_copy_to_batched_inputs, - ) - else: - self._test_inference_batch_consistent( - additional_params_copy_to_batched_inputs=additional_params_copy_to_batched_inputs - ) - - @skip_mps - def test_dict_tuple_outputs_equivalent(self): - return super().test_dict_tuple_outputs_equivalent() - - @skip_mps - def test_save_load_local(self): - return super().test_save_load_local(expected_max_difference=5e-3) - - @skip_mps - def test_save_load_optional_components(self): - return super().test_save_load_optional_components() - - @unittest.skip("UnCLIP produces very large differences in fp16 vs fp32. Test is not useful.") - def test_float16_inference(self): - super().test_float16_inference(expected_max_diff=1.0) - - -@nightly -class UnCLIPPipelineCPUIntegrationTests(unittest.TestCase): - def setUp(self): - # clean up the VRAM before each test - super().setUp() - gc.collect() - backend_empty_cache(torch_device) - - def tearDown(self): - # clean up the VRAM after each test - super().tearDown() - gc.collect() - backend_empty_cache(torch_device) - - def test_unclip_karlo_cpu_fp32(self): - expected_image = load_numpy( - "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main" - "/unclip/karlo_v1_alpha_horse_cpu.npy" - ) - - pipeline = UnCLIPPipeline.from_pretrained("kakaobrain/karlo-v1-alpha") - pipeline.set_progress_bar_config(disable=None) - - generator = torch.manual_seed(0) - output = pipeline( - "horse", - num_images_per_prompt=1, - generator=generator, - output_type="np", - ) - - image = output.images[0] - - assert image.shape == (256, 256, 3) - assert np.abs(expected_image - image).max() < 1e-1 - - -@nightly -@require_torch_accelerator -class UnCLIPPipelineIntegrationTests(unittest.TestCase): - def setUp(self): - # clean up the VRAM before each test - super().setUp() - gc.collect() - backend_empty_cache(torch_device) - - def tearDown(self): - # clean up the VRAM after each test - super().tearDown() - gc.collect() - backend_empty_cache(torch_device) - - def test_unclip_karlo(self): - expected_image = load_numpy( - "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main" - "/unclip/karlo_v1_alpha_horse_fp16.npy" - ) - - pipeline = UnCLIPPipeline.from_pretrained("kakaobrain/karlo-v1-alpha", torch_dtype=torch.float16) - pipeline = pipeline.to(torch_device) - pipeline.set_progress_bar_config(disable=None) - - generator = torch.Generator(device="cpu").manual_seed(0) - output = pipeline( - "horse", - generator=generator, - output_type="np", - ) - - image = output.images[0] - - assert image.shape == (256, 256, 3) - - assert_mean_pixel_difference(image, expected_image) - - def test_unclip_pipeline_with_sequential_cpu_offloading(self): - backend_empty_cache(torch_device) - 
backend_reset_max_memory_allocated(torch_device) - backend_reset_peak_memory_stats(torch_device) - - pipe = UnCLIPPipeline.from_pretrained("kakaobrain/karlo-v1-alpha", torch_dtype=torch.float16) - pipe.set_progress_bar_config(disable=None) - pipe.enable_attention_slicing() - pipe.enable_sequential_cpu_offload() - - _ = pipe( - "horse", - num_images_per_prompt=1, - prior_num_inference_steps=2, - decoder_num_inference_steps=2, - super_res_num_inference_steps=2, - output_type="np", - ) - - mem_bytes = backend_max_memory_allocated(torch_device) - # make sure that less than 7 GB is allocated - assert mem_bytes < 7 * 10**9 diff --git a/tests/pipelines/unclip/test_unclip_image_variation.py b/tests/pipelines/unclip/test_unclip_image_variation.py deleted file mode 100644 index 15733513a558..000000000000 --- a/tests/pipelines/unclip/test_unclip_image_variation.py +++ /dev/null @@ -1,540 +0,0 @@ -# coding=utf-8 -# Copyright 2025 HuggingFace Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import gc -import random -import unittest - -import numpy as np -import torch -from transformers import ( - CLIPImageProcessor, - CLIPTextConfig, - CLIPTextModelWithProjection, - CLIPTokenizer, - CLIPVisionConfig, - CLIPVisionModelWithProjection, -) - -from diffusers import ( - DiffusionPipeline, - UnCLIPImageVariationPipeline, - UnCLIPScheduler, - UNet2DConditionModel, - UNet2DModel, -) -from diffusers.pipelines.unclip.text_proj import UnCLIPTextProjModel -from diffusers.utils.testing_utils import ( - backend_empty_cache, - enable_full_determinism, - floats_tensor, - load_image, - load_numpy, - nightly, - require_torch_accelerator, - skip_mps, - torch_device, -) - -from ..pipeline_params import IMAGE_VARIATION_BATCH_PARAMS, IMAGE_VARIATION_PARAMS -from ..test_pipelines_common import PipelineTesterMixin, assert_mean_pixel_difference - - -enable_full_determinism() - - -class UnCLIPImageVariationPipelineFastTests(PipelineTesterMixin, unittest.TestCase): - pipeline_class = UnCLIPImageVariationPipeline - params = IMAGE_VARIATION_PARAMS - {"height", "width", "guidance_scale"} - batch_params = IMAGE_VARIATION_BATCH_PARAMS - - required_optional_params = [ - "generator", - "return_dict", - "decoder_num_inference_steps", - "super_res_num_inference_steps", - ] - test_xformers_attention = False - supports_dduf = False - - @property - def text_embedder_hidden_size(self): - return 32 - - @property - def time_input_dim(self): - return 32 - - @property - def block_out_channels_0(self): - return self.time_input_dim - - @property - def time_embed_dim(self): - return self.time_input_dim * 4 - - @property - def cross_attention_dim(self): - return 100 - - @property - def dummy_tokenizer(self): - tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip") - return tokenizer - - @property - def dummy_text_encoder(self): - torch.manual_seed(0) - config = CLIPTextConfig( - bos_token_id=0, - eos_token_id=2, - hidden_size=self.text_embedder_hidden_size, - projection_dim=self.text_embedder_hidden_size, - 
intermediate_size=37, - layer_norm_eps=1e-05, - num_attention_heads=4, - num_hidden_layers=5, - pad_token_id=1, - vocab_size=1000, - ) - return CLIPTextModelWithProjection(config) - - @property - def dummy_image_encoder(self): - torch.manual_seed(0) - config = CLIPVisionConfig( - hidden_size=self.text_embedder_hidden_size, - projection_dim=self.text_embedder_hidden_size, - num_hidden_layers=5, - num_attention_heads=4, - image_size=32, - intermediate_size=37, - patch_size=1, - ) - return CLIPVisionModelWithProjection(config) - - @property - def dummy_text_proj(self): - torch.manual_seed(0) - - model_kwargs = { - "clip_embeddings_dim": self.text_embedder_hidden_size, - "time_embed_dim": self.time_embed_dim, - "cross_attention_dim": self.cross_attention_dim, - } - - model = UnCLIPTextProjModel(**model_kwargs) - return model - - @property - def dummy_decoder(self): - torch.manual_seed(0) - - model_kwargs = { - "sample_size": 32, - # RGB in channels - "in_channels": 3, - # Out channels is double in channels because predicts mean and variance - "out_channels": 6, - "down_block_types": ("ResnetDownsampleBlock2D", "SimpleCrossAttnDownBlock2D"), - "up_block_types": ("SimpleCrossAttnUpBlock2D", "ResnetUpsampleBlock2D"), - "mid_block_type": "UNetMidBlock2DSimpleCrossAttn", - "block_out_channels": (self.block_out_channels_0, self.block_out_channels_0 * 2), - "layers_per_block": 1, - "cross_attention_dim": self.cross_attention_dim, - "attention_head_dim": 4, - "resnet_time_scale_shift": "scale_shift", - "class_embed_type": "identity", - } - - model = UNet2DConditionModel(**model_kwargs) - return model - - @property - def dummy_super_res_kwargs(self): - return { - "sample_size": 64, - "layers_per_block": 1, - "down_block_types": ("ResnetDownsampleBlock2D", "ResnetDownsampleBlock2D"), - "up_block_types": ("ResnetUpsampleBlock2D", "ResnetUpsampleBlock2D"), - "block_out_channels": (self.block_out_channels_0, self.block_out_channels_0 * 2), - "in_channels": 6, - "out_channels": 3, - } - - @property - def dummy_super_res_first(self): - torch.manual_seed(0) - - model = UNet2DModel(**self.dummy_super_res_kwargs) - return model - - @property - def dummy_super_res_last(self): - # seeded differently to get different unet than `self.dummy_super_res_first` - torch.manual_seed(1) - - model = UNet2DModel(**self.dummy_super_res_kwargs) - return model - - def get_dummy_components(self): - decoder = self.dummy_decoder - text_proj = self.dummy_text_proj - text_encoder = self.dummy_text_encoder - tokenizer = self.dummy_tokenizer - super_res_first = self.dummy_super_res_first - super_res_last = self.dummy_super_res_last - - decoder_scheduler = UnCLIPScheduler( - variance_type="learned_range", - prediction_type="epsilon", - num_train_timesteps=1000, - ) - - super_res_scheduler = UnCLIPScheduler( - variance_type="fixed_small_log", - prediction_type="epsilon", - num_train_timesteps=1000, - ) - - feature_extractor = CLIPImageProcessor(crop_size=32, size=32) - - image_encoder = self.dummy_image_encoder - - return { - "decoder": decoder, - "text_encoder": text_encoder, - "tokenizer": tokenizer, - "text_proj": text_proj, - "feature_extractor": feature_extractor, - "image_encoder": image_encoder, - "super_res_first": super_res_first, - "super_res_last": super_res_last, - "decoder_scheduler": decoder_scheduler, - "super_res_scheduler": super_res_scheduler, - } - - def get_dummy_inputs(self, device, seed=0, pil_image=True): - input_image = floats_tensor((1, 3, 32, 32), rng=random.Random(seed)).to(device) - if 
str(device).startswith("mps"): - generator = torch.manual_seed(seed) - else: - generator = torch.Generator(device=device).manual_seed(seed) - - if pil_image: - input_image = input_image * 0.5 + 0.5 - input_image = input_image.clamp(0, 1) - input_image = input_image.cpu().permute(0, 2, 3, 1).float().numpy() - input_image = DiffusionPipeline.numpy_to_pil(input_image)[0] - - return { - "image": input_image, - "generator": generator, - "decoder_num_inference_steps": 2, - "super_res_num_inference_steps": 2, - "output_type": "np", - } - - def test_unclip_image_variation_input_tensor(self): - device = "cpu" - - components = self.get_dummy_components() - - pipe = self.pipeline_class(**components) - pipe = pipe.to(device) - - pipe.set_progress_bar_config(disable=None) - - pipeline_inputs = self.get_dummy_inputs(device, pil_image=False) - - output = pipe(**pipeline_inputs) - image = output.images - - tuple_pipeline_inputs = self.get_dummy_inputs(device, pil_image=False) - - image_from_tuple = pipe( - **tuple_pipeline_inputs, - return_dict=False, - )[0] - - image_slice = image[0, -3:, -3:, -1] - image_from_tuple_slice = image_from_tuple[0, -3:, -3:, -1] - - assert image.shape == (1, 64, 64, 3) - - expected_slice = np.array( - [ - 0.9997, - 0.0002, - 0.9997, - 0.9997, - 0.9969, - 0.0023, - 0.9997, - 0.9969, - 0.9970, - ] - ) - - assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2 - assert np.abs(image_from_tuple_slice.flatten() - expected_slice).max() < 1e-2 - - def test_unclip_image_variation_input_image(self): - device = "cpu" - - components = self.get_dummy_components() - - pipe = self.pipeline_class(**components) - pipe = pipe.to(device) - - pipe.set_progress_bar_config(disable=None) - - pipeline_inputs = self.get_dummy_inputs(device, pil_image=True) - - output = pipe(**pipeline_inputs) - image = output.images - - tuple_pipeline_inputs = self.get_dummy_inputs(device, pil_image=True) - - image_from_tuple = pipe( - **tuple_pipeline_inputs, - return_dict=False, - )[0] - - image_slice = image[0, -3:, -3:, -1] - image_from_tuple_slice = image_from_tuple[0, -3:, -3:, -1] - - assert image.shape == (1, 64, 64, 3) - - expected_slice = np.array([0.9997, 0.0003, 0.9997, 0.9997, 0.9970, 0.0024, 0.9997, 0.9971, 0.9971]) - - assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2 - assert np.abs(image_from_tuple_slice.flatten() - expected_slice).max() < 1e-2 - - def test_unclip_image_variation_input_list_images(self): - device = "cpu" - - components = self.get_dummy_components() - - pipe = self.pipeline_class(**components) - pipe = pipe.to(device) - - pipe.set_progress_bar_config(disable=None) - - pipeline_inputs = self.get_dummy_inputs(device, pil_image=True) - pipeline_inputs["image"] = [ - pipeline_inputs["image"], - pipeline_inputs["image"], - ] - - output = pipe(**pipeline_inputs) - image = output.images - - tuple_pipeline_inputs = self.get_dummy_inputs(device, pil_image=True) - tuple_pipeline_inputs["image"] = [ - tuple_pipeline_inputs["image"], - tuple_pipeline_inputs["image"], - ] - - image_from_tuple = pipe( - **tuple_pipeline_inputs, - return_dict=False, - )[0] - - image_slice = image[0, -3:, -3:, -1] - image_from_tuple_slice = image_from_tuple[0, -3:, -3:, -1] - - assert image.shape == (2, 64, 64, 3) - - expected_slice = np.array( - [ - 0.9997, - 0.9989, - 0.0008, - 0.0021, - 0.9960, - 0.0018, - 0.0014, - 0.0002, - 0.9933, - ] - ) - - assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2 - assert np.abs(image_from_tuple_slice.flatten() - expected_slice).max() < 
1e-2 - - def test_unclip_passed_image_embed(self): - device = torch.device("cpu") - - class DummyScheduler: - init_noise_sigma = 1 - - components = self.get_dummy_components() - - pipe = self.pipeline_class(**components) - pipe = pipe.to(device) - - pipe.set_progress_bar_config(disable=None) - - generator = torch.Generator(device=device).manual_seed(0) - dtype = pipe.decoder.dtype - batch_size = 1 - - shape = ( - batch_size, - pipe.decoder.config.in_channels, - pipe.decoder.config.sample_size, - pipe.decoder.config.sample_size, - ) - decoder_latents = pipe.prepare_latents( - shape, dtype=dtype, device=device, generator=generator, latents=None, scheduler=DummyScheduler() - ) - - shape = ( - batch_size, - pipe.super_res_first.config.in_channels // 2, - pipe.super_res_first.config.sample_size, - pipe.super_res_first.config.sample_size, - ) - generator = torch.Generator(device=device).manual_seed(0) - super_res_latents = pipe.prepare_latents( - shape, dtype=dtype, device=device, generator=generator, latents=None, scheduler=DummyScheduler() - ) - - pipeline_inputs = self.get_dummy_inputs(device, pil_image=False) - - img_out_1 = pipe( - **pipeline_inputs, decoder_latents=decoder_latents, super_res_latents=super_res_latents - ).images - - pipeline_inputs = self.get_dummy_inputs(device, pil_image=False) - # Don't pass image, instead pass embedding - image = pipeline_inputs.pop("image") - image_embeddings = pipe.image_encoder(image).image_embeds - - img_out_2 = pipe( - **pipeline_inputs, - decoder_latents=decoder_latents, - super_res_latents=super_res_latents, - image_embeddings=image_embeddings, - ).images - - # make sure passing image embeddings manually is identical - assert np.abs(img_out_1 - img_out_2).max() < 1e-4 - - # Overriding PipelineTesterMixin::test_attention_slicing_forward_pass - # because UnCLIP GPU undeterminism requires a looser check. - @skip_mps - def test_attention_slicing_forward_pass(self): - test_max_difference = torch_device == "cpu" - - # Check is relaxed because there is not a torch 2.0 sliced attention added kv processor - expected_max_diff = 1e-2 - - self._test_attention_slicing_forward_pass( - test_max_difference=test_max_difference, expected_max_diff=expected_max_diff - ) - - # Overriding PipelineTesterMixin::test_inference_batch_single_identical - # because UnCLIP undeterminism requires a looser check. - @unittest.skip("UnCLIP produces very large differences. Test is not useful.") - @skip_mps - def test_inference_batch_single_identical(self): - additional_params_copy_to_batched_inputs = [ - "decoder_num_inference_steps", - "super_res_num_inference_steps", - ] - self._test_inference_batch_single_identical( - additional_params_copy_to_batched_inputs=additional_params_copy_to_batched_inputs, expected_max_diff=5e-3 - ) - - def test_inference_batch_consistent(self): - additional_params_copy_to_batched_inputs = [ - "decoder_num_inference_steps", - "super_res_num_inference_steps", - ] - - if torch_device == "mps": - # TODO: MPS errors with larger batch sizes - batch_sizes = [2, 3] - self._test_inference_batch_consistent( - batch_sizes=batch_sizes, - additional_params_copy_to_batched_inputs=additional_params_copy_to_batched_inputs, - ) - else: - self._test_inference_batch_consistent( - additional_params_copy_to_batched_inputs=additional_params_copy_to_batched_inputs - ) - - @skip_mps - def test_dict_tuple_outputs_equivalent(self): - return super().test_dict_tuple_outputs_equivalent() - - @unittest.skip("UnCLIP produces very large differences. 
Test is not useful.") - @skip_mps - def test_save_load_local(self): - return super().test_save_load_local(expected_max_difference=4e-3) - - @skip_mps - def test_save_load_optional_components(self): - return super().test_save_load_optional_components() - - @unittest.skip("UnCLIP produces very large difference in fp16 vs fp32. Test is not useful.") - def test_float16_inference(self): - super().test_float16_inference(expected_max_diff=1.0) - - -@nightly -@require_torch_accelerator -class UnCLIPImageVariationPipelineIntegrationTests(unittest.TestCase): - def setUp(self): - # clean up the VRAM before each test - super().setUp() - gc.collect() - backend_empty_cache(torch_device) - - def tearDown(self): - # clean up the VRAM after each test - super().tearDown() - gc.collect() - backend_empty_cache(torch_device) - - def test_unclip_image_variation_karlo(self): - input_image = load_image( - "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/unclip/cat.png" - ) - expected_image = load_numpy( - "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main" - "/unclip/karlo_v1_alpha_cat_variation_fp16.npy" - ) - - pipeline = UnCLIPImageVariationPipeline.from_pretrained( - "kakaobrain/karlo-v1-alpha-image-variations", torch_dtype=torch.float16 - ) - pipeline = pipeline.to(torch_device) - pipeline.set_progress_bar_config(disable=None) - - generator = torch.Generator(device="cpu").manual_seed(0) - output = pipeline( - input_image, - generator=generator, - output_type="np", - ) - - image = output.images[0] - - assert image.shape == (256, 256, 3) - - assert_mean_pixel_difference(image, expected_image, 15) diff --git a/tests/pipelines/unidiffuser/__init__.py b/tests/pipelines/unidiffuser/__init__.py deleted file mode 100644 index e69de29bb2d1..000000000000 diff --git a/tests/pipelines/unidiffuser/test_unidiffuser.py b/tests/pipelines/unidiffuser/test_unidiffuser.py deleted file mode 100644 index dccb1a85008b..000000000000 --- a/tests/pipelines/unidiffuser/test_unidiffuser.py +++ /dev/null @@ -1,764 +0,0 @@ -import gc -import random -import unittest - -import numpy as np -import torch -from PIL import Image -from transformers import ( - CLIPImageProcessor, - CLIPTextModel, - CLIPTokenizer, - CLIPVisionModelWithProjection, - GPT2Tokenizer, -) - -from diffusers import ( - AutoencoderKL, - DPMSolverMultistepScheduler, - UniDiffuserModel, - UniDiffuserPipeline, - UniDiffuserTextDecoder, -) -from diffusers.utils.testing_utils import ( - backend_empty_cache, - enable_full_determinism, - floats_tensor, - load_image, - nightly, - require_torch_accelerator, - torch_device, -) -from diffusers.utils.torch_utils import randn_tensor - -from ..pipeline_params import ( - IMAGE_TO_IMAGE_IMAGE_PARAMS, - TEXT_GUIDED_IMAGE_VARIATION_BATCH_PARAMS, - TEXT_GUIDED_IMAGE_VARIATION_PARAMS, -) -from ..test_pipelines_common import PipelineKarrasSchedulerTesterMixin, PipelineLatentTesterMixin, PipelineTesterMixin - - -enable_full_determinism() - - -class UniDiffuserPipelineFastTests( - PipelineTesterMixin, PipelineLatentTesterMixin, PipelineKarrasSchedulerTesterMixin, unittest.TestCase -): - pipeline_class = UniDiffuserPipeline - params = TEXT_GUIDED_IMAGE_VARIATION_PARAMS - batch_params = TEXT_GUIDED_IMAGE_VARIATION_BATCH_PARAMS - image_params = IMAGE_TO_IMAGE_IMAGE_PARAMS - # vae_latents, not latents, is the argument that corresponds to VAE latent inputs - image_latents_params = frozenset(["vae_latents"]) - - supports_dduf = False - - def get_dummy_components(self): - unet = 
UniDiffuserModel.from_pretrained( - "hf-internal-testing/unidiffuser-diffusers-test", - subfolder="unet", - ) - - scheduler = DPMSolverMultistepScheduler( - beta_start=0.00085, - beta_end=0.012, - beta_schedule="scaled_linear", - solver_order=3, - ) - - vae = AutoencoderKL.from_pretrained( - "hf-internal-testing/unidiffuser-diffusers-test", - subfolder="vae", - ) - - text_encoder = CLIPTextModel.from_pretrained( - "hf-internal-testing/unidiffuser-diffusers-test", - subfolder="text_encoder", - ) - clip_tokenizer = CLIPTokenizer.from_pretrained( - "hf-internal-testing/unidiffuser-diffusers-test", - subfolder="clip_tokenizer", - ) - - image_encoder = CLIPVisionModelWithProjection.from_pretrained( - "hf-internal-testing/unidiffuser-diffusers-test", - subfolder="image_encoder", - ) - # From the Stable Diffusion Image Variation pipeline tests - clip_image_processor = CLIPImageProcessor(crop_size=32, size=32) - # image_processor = CLIPImageProcessor.from_pretrained("hf-internal-testing/tiny-random-clip") - - text_tokenizer = GPT2Tokenizer.from_pretrained( - "hf-internal-testing/unidiffuser-diffusers-test", - subfolder="text_tokenizer", - ) - text_decoder = UniDiffuserTextDecoder.from_pretrained( - "hf-internal-testing/unidiffuser-diffusers-test", - subfolder="text_decoder", - ) - - components = { - "vae": vae, - "text_encoder": text_encoder, - "image_encoder": image_encoder, - "clip_image_processor": clip_image_processor, - "clip_tokenizer": clip_tokenizer, - "text_decoder": text_decoder, - "text_tokenizer": text_tokenizer, - "unet": unet, - "scheduler": scheduler, - } - - return components - - def get_dummy_inputs(self, device, seed=0): - image = floats_tensor((1, 3, 32, 32), rng=random.Random(seed)).to(device) - image = image.cpu().permute(0, 2, 3, 1)[0] - image = Image.fromarray(np.uint8(image)).convert("RGB") - if str(device).startswith("mps"): - generator = torch.manual_seed(seed) - else: - generator = torch.Generator(device=device).manual_seed(seed) - inputs = { - "prompt": "an elephant under the sea", - "image": image, - "generator": generator, - "num_inference_steps": 2, - "guidance_scale": 6.0, - "output_type": "np", - } - return inputs - - def get_fixed_latents(self, device, seed=0): - if isinstance(device, str): - device = torch.device(device) - generator = torch.Generator(device=device).manual_seed(seed) - # Hardcode the shapes for now. 
- prompt_latents = randn_tensor((1, 77, 32), generator=generator, device=device, dtype=torch.float32) - vae_latents = randn_tensor((1, 4, 16, 16), generator=generator, device=device, dtype=torch.float32) - clip_latents = randn_tensor((1, 1, 32), generator=generator, device=device, dtype=torch.float32) - - latents = { - "prompt_latents": prompt_latents, - "vae_latents": vae_latents, - "clip_latents": clip_latents, - } - return latents - - def get_dummy_inputs_with_latents(self, device, seed=0): - # image = floats_tensor((1, 3, 32, 32), rng=random.Random(seed)).to(device) - # image = image.cpu().permute(0, 2, 3, 1)[0] - # image = Image.fromarray(np.uint8(image)).convert("RGB") - image = load_image( - "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/unidiffuser/unidiffuser_example_image.jpg", - ) - image = image.resize((32, 32)) - latents = self.get_fixed_latents(device, seed=seed) - - if str(device).startswith("mps"): - generator = torch.manual_seed(seed) - else: - generator = torch.Generator(device=device).manual_seed(seed) - - inputs = { - "prompt": "an elephant under the sea", - "image": image, - "generator": generator, - "num_inference_steps": 2, - "guidance_scale": 6.0, - "output_type": "np", - "prompt_latents": latents.get("prompt_latents"), - "vae_latents": latents.get("vae_latents"), - "clip_latents": latents.get("clip_latents"), - } - return inputs - - def test_dict_tuple_outputs_equivalent(self): - expected_slice = None - if torch_device == "cpu": - expected_slice = np.array([0.7489, 0.3722, 0.4475, 0.5630, 0.5923, 0.4992, 0.3936, 0.5844, 0.4975]) - super().test_dict_tuple_outputs_equivalent(expected_slice=expected_slice) - - def test_unidiffuser_default_joint_v0(self): - device = "cpu" # ensure determinism for the device-dependent torch.Generator - components = self.get_dummy_components() - unidiffuser_pipe = UniDiffuserPipeline(**components) - unidiffuser_pipe = unidiffuser_pipe.to(device) - unidiffuser_pipe.set_progress_bar_config(disable=None) - - # Set mode to 'joint' - unidiffuser_pipe.set_joint_mode() - assert unidiffuser_pipe.mode == "joint" - - # inputs = self.get_dummy_inputs(device) - inputs = self.get_dummy_inputs_with_latents(device) - # Delete prompt and image for joint inference. - del inputs["prompt"] - del inputs["image"] - sample = unidiffuser_pipe(**inputs) - image = sample.images - text = sample.text - assert image.shape == (1, 32, 32, 3) - - image_slice = image[0, -3:, -3:, -1] - expected_img_slice = np.array([0.5760, 0.6270, 0.6571, 0.4965, 0.4638, 0.5663, 0.5254, 0.5068, 0.5716]) - assert np.abs(image_slice.flatten() - expected_img_slice).max() < 1e-3 - - expected_text_prefix = " no no no " - assert text[0][:10] == expected_text_prefix - - def test_unidiffuser_default_joint_no_cfg_v0(self): - device = "cpu" # ensure determinism for the device-dependent torch.Generator - components = self.get_dummy_components() - unidiffuser_pipe = UniDiffuserPipeline(**components) - unidiffuser_pipe = unidiffuser_pipe.to(device) - unidiffuser_pipe.set_progress_bar_config(disable=None) - - # Set mode to 'joint' - unidiffuser_pipe.set_joint_mode() - assert unidiffuser_pipe.mode == "joint" - - # inputs = self.get_dummy_inputs(device) - inputs = self.get_dummy_inputs_with_latents(device) - # Delete prompt and image for joint inference. 
- del inputs["prompt"] - del inputs["image"] - # Set guidance scale to 1.0 to turn off CFG - inputs["guidance_scale"] = 1.0 - sample = unidiffuser_pipe(**inputs) - image = sample.images - text = sample.text - assert image.shape == (1, 32, 32, 3) - - image_slice = image[0, -3:, -3:, -1] - expected_img_slice = np.array([0.5760, 0.6270, 0.6571, 0.4965, 0.4638, 0.5663, 0.5254, 0.5068, 0.5716]) - assert np.abs(image_slice.flatten() - expected_img_slice).max() < 1e-3 - - expected_text_prefix = " no no no " - assert text[0][:10] == expected_text_prefix - - def test_unidiffuser_default_text2img_v0(self): - device = "cpu" # ensure determinism for the device-dependent torch.Generator - components = self.get_dummy_components() - unidiffuser_pipe = UniDiffuserPipeline(**components) - unidiffuser_pipe = unidiffuser_pipe.to(device) - unidiffuser_pipe.set_progress_bar_config(disable=None) - - # Set mode to 'text2img' - unidiffuser_pipe.set_text_to_image_mode() - assert unidiffuser_pipe.mode == "text2img" - - inputs = self.get_dummy_inputs_with_latents(device) - # Delete image for text-conditioned image generation - del inputs["image"] - image = unidiffuser_pipe(**inputs).images - assert image.shape == (1, 32, 32, 3) - - image_slice = image[0, -3:, -3:, -1] - expected_slice = np.array([0.5758, 0.6269, 0.6570, 0.4967, 0.4639, 0.5664, 0.5257, 0.5067, 0.5715]) - assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-3 - - def test_unidiffuser_default_image_0(self): - device = "cpu" # ensure determinism for the device-dependent torch.Generator - components = self.get_dummy_components() - unidiffuser_pipe = UniDiffuserPipeline(**components) - unidiffuser_pipe = unidiffuser_pipe.to(device) - unidiffuser_pipe.set_progress_bar_config(disable=None) - - # Set mode to 'img' - unidiffuser_pipe.set_image_mode() - assert unidiffuser_pipe.mode == "img" - - inputs = self.get_dummy_inputs(device) - # Delete prompt and image for unconditional ("marginal") text generation. - del inputs["prompt"] - del inputs["image"] - image = unidiffuser_pipe(**inputs).images - assert image.shape == (1, 32, 32, 3) - - image_slice = image[0, -3:, -3:, -1] - expected_slice = np.array([0.5760, 0.6270, 0.6571, 0.4966, 0.4638, 0.5663, 0.5254, 0.5068, 0.5715]) - assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-3 - - def test_unidiffuser_default_text_v0(self): - device = "cpu" # ensure determinism for the device-dependent torch.Generator - components = self.get_dummy_components() - unidiffuser_pipe = UniDiffuserPipeline(**components) - unidiffuser_pipe = unidiffuser_pipe.to(device) - unidiffuser_pipe.set_progress_bar_config(disable=None) - - # Set mode to 'img' - unidiffuser_pipe.set_text_mode() - assert unidiffuser_pipe.mode == "text" - - inputs = self.get_dummy_inputs(device) - # Delete prompt and image for unconditional ("marginal") text generation. 
- del inputs["prompt"] - del inputs["image"] - text = unidiffuser_pipe(**inputs).text - - expected_text_prefix = " no no no " - assert text[0][:10] == expected_text_prefix - - def test_unidiffuser_default_img2text_v0(self): - device = "cpu" # ensure determinism for the device-dependent torch.Generator - components = self.get_dummy_components() - unidiffuser_pipe = UniDiffuserPipeline(**components) - unidiffuser_pipe = unidiffuser_pipe.to(device) - unidiffuser_pipe.set_progress_bar_config(disable=None) - - # Set mode to 'img2text' - unidiffuser_pipe.set_image_to_text_mode() - assert unidiffuser_pipe.mode == "img2text" - - inputs = self.get_dummy_inputs_with_latents(device) - # Delete text for image-conditioned text generation - del inputs["prompt"] - text = unidiffuser_pipe(**inputs).text - - expected_text_prefix = " no no no " - assert text[0][:10] == expected_text_prefix - - def test_unidiffuser_default_joint_v1(self): - device = "cpu" # ensure determinism for the device-dependent torch.Generator - unidiffuser_pipe = UniDiffuserPipeline.from_pretrained("hf-internal-testing/unidiffuser-test-v1") - unidiffuser_pipe = unidiffuser_pipe.to(device) - unidiffuser_pipe.set_progress_bar_config(disable=None) - - # Set mode to 'joint' - unidiffuser_pipe.set_joint_mode() - assert unidiffuser_pipe.mode == "joint" - - # inputs = self.get_dummy_inputs(device) - inputs = self.get_dummy_inputs_with_latents(device) - # Delete prompt and image for joint inference. - del inputs["prompt"] - del inputs["image"] - inputs["data_type"] = 1 - sample = unidiffuser_pipe(**inputs) - image = sample.images - text = sample.text - assert image.shape == (1, 32, 32, 3) - - image_slice = image[0, -3:, -3:, -1] - expected_img_slice = np.array([0.5760, 0.6270, 0.6571, 0.4965, 0.4638, 0.5663, 0.5254, 0.5068, 0.5716]) - assert np.abs(image_slice.flatten() - expected_img_slice).max() < 1e-3 - - expected_text_prefix = " no no no " - assert text[0][:10] == expected_text_prefix - - def test_unidiffuser_default_text2img_v1(self): - device = "cpu" # ensure determinism for the device-dependent torch.Generator - unidiffuser_pipe = UniDiffuserPipeline.from_pretrained("hf-internal-testing/unidiffuser-test-v1") - unidiffuser_pipe = unidiffuser_pipe.to(device) - unidiffuser_pipe.set_progress_bar_config(disable=None) - - # Set mode to 'text2img' - unidiffuser_pipe.set_text_to_image_mode() - assert unidiffuser_pipe.mode == "text2img" - - inputs = self.get_dummy_inputs_with_latents(device) - # Delete image for text-conditioned image generation - del inputs["image"] - image = unidiffuser_pipe(**inputs).images - assert image.shape == (1, 32, 32, 3) - - image_slice = image[0, -3:, -3:, -1] - expected_slice = np.array([0.5758, 0.6269, 0.6570, 0.4967, 0.4639, 0.5664, 0.5257, 0.5067, 0.5715]) - assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-3 - - def test_unidiffuser_default_img2text_v1(self): - device = "cpu" # ensure determinism for the device-dependent torch.Generator - unidiffuser_pipe = UniDiffuserPipeline.from_pretrained("hf-internal-testing/unidiffuser-test-v1") - unidiffuser_pipe = unidiffuser_pipe.to(device) - unidiffuser_pipe.set_progress_bar_config(disable=None) - - # Set mode to 'img2text' - unidiffuser_pipe.set_image_to_text_mode() - assert unidiffuser_pipe.mode == "img2text" - - inputs = self.get_dummy_inputs_with_latents(device) - # Delete text for image-conditioned text generation - del inputs["prompt"] - text = unidiffuser_pipe(**inputs).text - - expected_text_prefix = " no no no " - assert text[0][:10] == 
expected_text_prefix - - def test_unidiffuser_text2img_multiple_images(self): - device = "cpu" # ensure determinism for the device-dependent torch.Generator - components = self.get_dummy_components() - unidiffuser_pipe = UniDiffuserPipeline(**components) - unidiffuser_pipe = unidiffuser_pipe.to(device) - unidiffuser_pipe.set_progress_bar_config(disable=None) - - # Set mode to 'text2img' - unidiffuser_pipe.set_text_to_image_mode() - assert unidiffuser_pipe.mode == "text2img" - - inputs = self.get_dummy_inputs(device) - # Delete image for text-conditioned image generation - del inputs["image"] - inputs["num_images_per_prompt"] = 2 - inputs["num_prompts_per_image"] = 3 - image = unidiffuser_pipe(**inputs).images - assert image.shape == (2, 32, 32, 3) - - def test_unidiffuser_img2text_multiple_prompts(self): - device = "cpu" # ensure determinism for the device-dependent torch.Generator - components = self.get_dummy_components() - unidiffuser_pipe = UniDiffuserPipeline(**components) - unidiffuser_pipe = unidiffuser_pipe.to(device) - unidiffuser_pipe.set_progress_bar_config(disable=None) - - # Set mode to 'img2text' - unidiffuser_pipe.set_image_to_text_mode() - assert unidiffuser_pipe.mode == "img2text" - - inputs = self.get_dummy_inputs(device) - # Delete text for image-conditioned text generation - del inputs["prompt"] - inputs["num_images_per_prompt"] = 2 - inputs["num_prompts_per_image"] = 3 - text = unidiffuser_pipe(**inputs).text - - assert len(text) == 3 - - def test_unidiffuser_text2img_multiple_images_with_latents(self): - device = "cpu" # ensure determinism for the device-dependent torch.Generator - components = self.get_dummy_components() - unidiffuser_pipe = UniDiffuserPipeline(**components) - unidiffuser_pipe = unidiffuser_pipe.to(device) - unidiffuser_pipe.set_progress_bar_config(disable=None) - - # Set mode to 'text2img' - unidiffuser_pipe.set_text_to_image_mode() - assert unidiffuser_pipe.mode == "text2img" - - inputs = self.get_dummy_inputs_with_latents(device) - # Delete image for text-conditioned image generation - del inputs["image"] - inputs["num_images_per_prompt"] = 2 - inputs["num_prompts_per_image"] = 3 - image = unidiffuser_pipe(**inputs).images - assert image.shape == (2, 32, 32, 3) - - def test_unidiffuser_img2text_multiple_prompts_with_latents(self): - device = "cpu" # ensure determinism for the device-dependent torch.Generator - components = self.get_dummy_components() - unidiffuser_pipe = UniDiffuserPipeline(**components) - unidiffuser_pipe = unidiffuser_pipe.to(device) - unidiffuser_pipe.set_progress_bar_config(disable=None) - - # Set mode to 'img2text' - unidiffuser_pipe.set_image_to_text_mode() - assert unidiffuser_pipe.mode == "img2text" - - inputs = self.get_dummy_inputs_with_latents(device) - # Delete text for image-conditioned text generation - del inputs["prompt"] - inputs["num_images_per_prompt"] = 2 - inputs["num_prompts_per_image"] = 3 - text = unidiffuser_pipe(**inputs).text - - assert len(text) == 3 - - def test_inference_batch_single_identical(self): - super().test_inference_batch_single_identical(expected_max_diff=2e-4) - - @require_torch_accelerator - def test_unidiffuser_default_joint_v1_fp16(self): - unidiffuser_pipe = UniDiffuserPipeline.from_pretrained( - "hf-internal-testing/unidiffuser-test-v1", torch_dtype=torch.float16 - ) - unidiffuser_pipe = unidiffuser_pipe.to(torch_device) - unidiffuser_pipe.set_progress_bar_config(disable=None) - - # Set mode to 'joint' - unidiffuser_pipe.set_joint_mode() - assert unidiffuser_pipe.mode == "joint" - - 
inputs = self.get_dummy_inputs_with_latents(torch_device) - # Delete prompt and image for joint inference. - del inputs["prompt"] - del inputs["image"] - inputs["data_type"] = 1 - sample = unidiffuser_pipe(**inputs) - image = sample.images - text = sample.text - assert image.shape == (1, 32, 32, 3) - - image_slice = image[0, -3:, -3:, -1] - expected_img_slice = np.array([0.5049, 0.5498, 0.5854, 0.3052, 0.4460, 0.6489, 0.5122, 0.4810, 0.6138]) - assert np.abs(image_slice.flatten() - expected_img_slice).max() < 1e-3 - - expected_text_prefix = '" This This' - assert text[0][: len(expected_text_prefix)] == expected_text_prefix - - @require_torch_accelerator - def test_unidiffuser_default_text2img_v1_fp16(self): - unidiffuser_pipe = UniDiffuserPipeline.from_pretrained( - "hf-internal-testing/unidiffuser-test-v1", torch_dtype=torch.float16 - ) - unidiffuser_pipe = unidiffuser_pipe.to(torch_device) - unidiffuser_pipe.set_progress_bar_config(disable=None) - - # Set mode to 'text2img' - unidiffuser_pipe.set_text_to_image_mode() - assert unidiffuser_pipe.mode == "text2img" - - inputs = self.get_dummy_inputs_with_latents(torch_device) - # Delete image for text-conditioned image generation. - del inputs["image"] - inputs["data_type"] = 1 - sample = unidiffuser_pipe(**inputs) - image = sample.images - assert image.shape == (1, 32, 32, 3) - - image_slice = image[0, -3:, -3:, -1] - expected_img_slice = np.array([0.5054, 0.5498, 0.5854, 0.3052, 0.4458, 0.6489, 0.5122, 0.4810, 0.6138]) - assert np.abs(image_slice.flatten() - expected_img_slice).max() < 1e-3 - - @require_torch_accelerator - def test_unidiffuser_default_img2text_v1_fp16(self): - unidiffuser_pipe = UniDiffuserPipeline.from_pretrained( - "hf-internal-testing/unidiffuser-test-v1", torch_dtype=torch.float16 - ) - unidiffuser_pipe = unidiffuser_pipe.to(torch_device) - unidiffuser_pipe.set_progress_bar_config(disable=None) - - # Set mode to 'img2text' - unidiffuser_pipe.set_image_to_text_mode() - assert unidiffuser_pipe.mode == "img2text" - - inputs = self.get_dummy_inputs_with_latents(torch_device) - # Delete prompt for image-conditioned text generation. - del inputs["prompt"] - inputs["data_type"] = 1 - text = unidiffuser_pipe(**inputs).text - - expected_text_prefix = '" This This' - assert text[0][: len(expected_text_prefix)] == expected_text_prefix - - @unittest.skip( - "Test not supported because the pipeline takes several configs directly at init and is rarely used now."
- ) - def test_encode_prompt_works_in_isolation(self): - pass - - -@nightly -@require_torch_accelerator -class UniDiffuserPipelineSlowTests(unittest.TestCase): - def setUp(self): - super().setUp() - gc.collect() - backend_empty_cache(torch_device) - - def tearDown(self): - super().tearDown() - gc.collect() - backend_empty_cache(torch_device) - - def get_inputs(self, device, seed=0, generate_latents=False): - generator = torch.manual_seed(seed) - image = load_image( - "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/unidiffuser/unidiffuser_example_image.jpg" - ) - inputs = { - "prompt": "an elephant under the sea", - "image": image, - "generator": generator, - "num_inference_steps": 3, - "guidance_scale": 8.0, - "output_type": "np", - } - if generate_latents: - latents = self.get_fixed_latents(device, seed=seed) - for latent_name, latent_tensor in latents.items(): - inputs[latent_name] = latent_tensor - return inputs - - def get_fixed_latents(self, device, seed=0): - if isinstance(device, str): - device = torch.device(device) - latent_device = torch.device("cpu") - generator = torch.Generator(device=latent_device).manual_seed(seed) - # Hardcode the shapes for now. - prompt_latents = randn_tensor((1, 77, 768), generator=generator, device=device, dtype=torch.float32) - vae_latents = randn_tensor((1, 4, 64, 64), generator=generator, device=device, dtype=torch.float32) - clip_latents = randn_tensor((1, 1, 512), generator=generator, device=device, dtype=torch.float32) - - # Move latents onto desired device. - prompt_latents = prompt_latents.to(device) - vae_latents = vae_latents.to(device) - clip_latents = clip_latents.to(device) - - latents = { - "prompt_latents": prompt_latents, - "vae_latents": vae_latents, - "clip_latents": clip_latents, - } - return latents - - def test_unidiffuser_default_joint_v1(self): - pipe = UniDiffuserPipeline.from_pretrained("thu-ml/unidiffuser-v1") - pipe.to(torch_device) - pipe.set_progress_bar_config(disable=None) - pipe.enable_attention_slicing() - - # inputs = self.get_dummy_inputs(device) - inputs = self.get_inputs(device=torch_device, generate_latents=True) - # Delete prompt and image for joint inference.
- del inputs["prompt"] - del inputs["image"] - sample = pipe(**inputs) - image = sample.images - text = sample.text - assert image.shape == (1, 512, 512, 3) - - image_slice = image[0, -3:, -3:, -1] - expected_img_slice = np.array([0.2402, 0.2375, 0.2285, 0.2378, 0.2407, 0.2263, 0.2354, 0.2307, 0.2520]) - assert np.abs(image_slice.flatten() - expected_img_slice).max() < 1e-1 - - expected_text_prefix = "a living room" - assert text[0][: len(expected_text_prefix)] == expected_text_prefix - - def test_unidiffuser_default_text2img_v1(self): - pipe = UniDiffuserPipeline.from_pretrained("thu-ml/unidiffuser-v1") - pipe.to(torch_device) - pipe.set_progress_bar_config(disable=None) - pipe.enable_attention_slicing() - - inputs = self.get_inputs(device=torch_device, generate_latents=True) - del inputs["image"] - sample = pipe(**inputs) - image = sample.images - assert image.shape == (1, 512, 512, 3) - - image_slice = image[0, -3:, -3:, -1] - expected_slice = np.array([0.0242, 0.0103, 0.0022, 0.0129, 0.0000, 0.0090, 0.0376, 0.0508, 0.0005]) - assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-1 - - def test_unidiffuser_default_img2text_v1(self): - pipe = UniDiffuserPipeline.from_pretrained("thu-ml/unidiffuser-v1") - pipe.to(torch_device) - pipe.set_progress_bar_config(disable=None) - pipe.enable_attention_slicing() - - inputs = self.get_inputs(device=torch_device, generate_latents=True) - del inputs["prompt"] - sample = pipe(**inputs) - text = sample.text - - expected_text_prefix = "An astronaut" - assert text[0][: len(expected_text_prefix)] == expected_text_prefix - - -@nightly -@require_torch_accelerator -class UniDiffuserPipelineNightlyTests(unittest.TestCase): - def setUp(self): - super().setUp() - gc.collect() - backend_empty_cache(torch_device) - - def tearDown(self): - super().tearDown() - gc.collect() - backend_empty_cache(torch_device) - - def get_inputs(self, device, seed=0, generate_latents=False): - generator = torch.manual_seed(seed) - image = load_image( - "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/unidiffuser/unidiffuser_example_image.jpg" - ) - inputs = { - "prompt": "an elephant under the sea", - "image": image, - "generator": generator, - "num_inference_steps": 3, - "guidance_scale": 8.0, - "output_type": "np", - } - if generate_latents: - latents = self.get_fixed_latents(device, seed=seed) - for latent_name, latent_tensor in latents.items(): - inputs[latent_name] = latent_tensor - return inputs - - def get_fixed_latents(self, device, seed=0): - if isinstance(device, str): - device = torch.device(device) - latent_device = torch.device("cpu") - generator = torch.Generator(device=latent_device).manual_seed(seed) - # Hardcode the shapes for now. - prompt_latents = randn_tensor((1, 77, 768), generator=generator, device=device, dtype=torch.float32) - vae_latents = randn_tensor((1, 4, 64, 64), generator=generator, device=device, dtype=torch.float32) - clip_latents = randn_tensor((1, 1, 512), generator=generator, device=device, dtype=torch.float32) - - # Move latents onto desired device. 
- prompt_latents = prompt_latents.to(device) - vae_latents = vae_latents.to(device) - clip_latents = clip_latents.to(device) - - latents = { - "prompt_latents": prompt_latents, - "vae_latents": vae_latents, - "clip_latents": clip_latents, - } - return latents - - def test_unidiffuser_default_joint_v1_fp16(self): - pipe = UniDiffuserPipeline.from_pretrained("thu-ml/unidiffuser-v1", torch_dtype=torch.float16) - pipe.to(torch_device) - pipe.set_progress_bar_config(disable=None) - pipe.enable_attention_slicing() - - # inputs = self.get_dummy_inputs(device) - inputs = self.get_inputs(device=torch_device, generate_latents=True) - # Delete prompt and image for joint inference. - del inputs["prompt"] - del inputs["image"] - sample = pipe(**inputs) - image = sample.images - text = sample.text - assert image.shape == (1, 512, 512, 3) - - image_slice = image[0, -3:, -3:, -1] - expected_img_slice = np.array([0.2402, 0.2375, 0.2285, 0.2378, 0.2407, 0.2263, 0.2354, 0.2307, 0.2520]) - assert np.abs(image_slice.flatten() - expected_img_slice).max() < 2e-1 - - expected_text_prefix = "a living room" - assert text[0][: len(expected_text_prefix)] == expected_text_prefix - - def test_unidiffuser_default_text2img_v1_fp16(self): - pipe = UniDiffuserPipeline.from_pretrained("thu-ml/unidiffuser-v1", torch_dtype=torch.float16) - pipe.to(torch_device) - pipe.set_progress_bar_config(disable=None) - pipe.enable_attention_slicing() - - inputs = self.get_inputs(device=torch_device, generate_latents=True) - del inputs["image"] - sample = pipe(**inputs) - image = sample.images - assert image.shape == (1, 512, 512, 3) - - image_slice = image[0, -3:, -3:, -1] - expected_slice = np.array([0.0242, 0.0103, 0.0022, 0.0129, 0.0000, 0.0090, 0.0376, 0.0508, 0.0005]) - assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-1 - - def test_unidiffuser_default_img2text_v1_fp16(self): - pipe = UniDiffuserPipeline.from_pretrained("thu-ml/unidiffuser-v1", torch_dtype=torch.float16) - pipe.to(torch_device) - pipe.set_progress_bar_config(disable=None) - pipe.enable_attention_slicing() - - inputs = self.get_inputs(device=torch_device, generate_latents=True) - del inputs["prompt"] - sample = pipe(**inputs) - text = sample.text - - expected_text_prefix = "An astronaut" - assert text[0][: len(expected_text_prefix)] == expected_text_prefix diff --git a/tests/pipelines/wuerstchen/__init__.py b/tests/pipelines/wuerstchen/__init__.py deleted file mode 100644 index e69de29bb2d1..000000000000 diff --git a/tests/pipelines/wuerstchen/test_wuerstchen_combined.py b/tests/pipelines/wuerstchen/test_wuerstchen_combined.py deleted file mode 100644 index 060a11434ecb..000000000000 --- a/tests/pipelines/wuerstchen/test_wuerstchen_combined.py +++ /dev/null @@ -1,241 +0,0 @@ -# coding=utf-8 -# Copyright 2025 HuggingFace Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import unittest - -import numpy as np -import torch -from transformers import CLIPTextConfig, CLIPTextModel, CLIPTokenizer - -from diffusers import DDPMWuerstchenScheduler, WuerstchenCombinedPipeline -from diffusers.pipelines.wuerstchen import PaellaVQModel, WuerstchenDiffNeXt, WuerstchenPrior -from diffusers.utils.testing_utils import enable_full_determinism, require_torch_accelerator, torch_device - -from ..test_pipelines_common import PipelineTesterMixin - - -enable_full_determinism() - - -class WuerstchenCombinedPipelineFastTests(PipelineTesterMixin, unittest.TestCase): - pipeline_class = WuerstchenCombinedPipeline - params = ["prompt"] - batch_params = ["prompt", "negative_prompt"] - required_optional_params = [ - "generator", - "height", - "width", - "latents", - "prior_guidance_scale", - "decoder_guidance_scale", - "negative_prompt", - "num_inference_steps", - "return_dict", - "prior_num_inference_steps", - "output_type", - ] - test_xformers_attention = True - - @property - def text_embedder_hidden_size(self): - return 32 - - @property - def dummy_prior(self): - torch.manual_seed(0) - - model_kwargs = {"c_in": 2, "c": 8, "depth": 2, "c_cond": 32, "c_r": 8, "nhead": 2} - model = WuerstchenPrior(**model_kwargs) - return model.eval() - - @property - def dummy_tokenizer(self): - tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip") - return tokenizer - - @property - def dummy_prior_text_encoder(self): - torch.manual_seed(0) - config = CLIPTextConfig( - bos_token_id=0, - eos_token_id=2, - hidden_size=self.text_embedder_hidden_size, - intermediate_size=37, - layer_norm_eps=1e-05, - num_attention_heads=4, - num_hidden_layers=5, - pad_token_id=1, - vocab_size=1000, - ) - return CLIPTextModel(config).eval() - - @property - def dummy_text_encoder(self): - torch.manual_seed(0) - config = CLIPTextConfig( - bos_token_id=0, - eos_token_id=2, - projection_dim=self.text_embedder_hidden_size, - hidden_size=self.text_embedder_hidden_size, - intermediate_size=37, - layer_norm_eps=1e-05, - num_attention_heads=4, - num_hidden_layers=5, - pad_token_id=1, - vocab_size=1000, - ) - return CLIPTextModel(config).eval() - - @property - def dummy_vqgan(self): - torch.manual_seed(0) - - model_kwargs = { - "bottleneck_blocks": 1, - "num_vq_embeddings": 2, - } - model = PaellaVQModel(**model_kwargs) - return model.eval() - - @property - def dummy_decoder(self): - torch.manual_seed(0) - - model_kwargs = { - "c_cond": self.text_embedder_hidden_size, - "c_hidden": [320], - "nhead": [-1], - "blocks": [4], - "level_config": ["CT"], - "clip_embd": self.text_embedder_hidden_size, - "inject_effnet": [False], - } - - model = WuerstchenDiffNeXt(**model_kwargs) - return model.eval() - - def get_dummy_components(self): - prior = self.dummy_prior - prior_text_encoder = self.dummy_prior_text_encoder - - scheduler = DDPMWuerstchenScheduler() - tokenizer = self.dummy_tokenizer - - text_encoder = self.dummy_text_encoder - decoder = self.dummy_decoder - vqgan = self.dummy_vqgan - - components = { - "tokenizer": tokenizer, - "text_encoder": text_encoder, - "decoder": decoder, - "vqgan": vqgan, - "scheduler": scheduler, - "prior_prior": prior, - "prior_text_encoder": prior_text_encoder, - "prior_tokenizer": tokenizer, - "prior_scheduler": scheduler, - } - - return components - - def get_dummy_inputs(self, device, seed=0): - if str(device).startswith("mps"): - generator = torch.manual_seed(seed) - else: - generator = torch.Generator(device=device).manual_seed(seed) - inputs = { - "prompt": "horse", - 
"generator": generator, - "prior_guidance_scale": 4.0, - "decoder_guidance_scale": 4.0, - "num_inference_steps": 2, - "prior_num_inference_steps": 2, - "output_type": "np", - "height": 128, - "width": 128, - } - return inputs - - def test_wuerstchen(self): - device = "cpu" - - components = self.get_dummy_components() - - pipe = self.pipeline_class(**components) - pipe = pipe.to(device) - - pipe.set_progress_bar_config(disable=None) - - output = pipe(**self.get_dummy_inputs(device)) - image = output.images - - image_from_tuple = pipe(**self.get_dummy_inputs(device), return_dict=False)[0] - - image_slice = image[0, -3:, -3:, -1] - image_from_tuple_slice = image_from_tuple[-3:, -3:, -1] - - assert image.shape == (1, 128, 128, 3) - - expected_slice = np.array([0.7616304, 0.0, 1.0, 0.0, 1.0, 0.0, 0.05925313, 0.0, 0.951898]) - - assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2, ( - f" expected_slice {expected_slice}, but got {image_slice.flatten()}" - ) - assert np.abs(image_from_tuple_slice.flatten() - expected_slice).max() < 1e-2, ( - f" expected_slice {expected_slice}, but got {image_from_tuple_slice.flatten()}" - ) - - @require_torch_accelerator - def test_offloads(self): - pipes = [] - components = self.get_dummy_components() - sd_pipe = self.pipeline_class(**components).to(torch_device) - pipes.append(sd_pipe) - - components = self.get_dummy_components() - sd_pipe = self.pipeline_class(**components) - sd_pipe.enable_sequential_cpu_offload(device=torch_device) - pipes.append(sd_pipe) - - components = self.get_dummy_components() - sd_pipe = self.pipeline_class(**components) - sd_pipe.enable_model_cpu_offload(device=torch_device) - pipes.append(sd_pipe) - - image_slices = [] - for pipe in pipes: - inputs = self.get_dummy_inputs(torch_device) - image = pipe(**inputs).images - - image_slices.append(image[0, -3:, -3:, -1].flatten()) - - assert np.abs(image_slices[0] - image_slices[1]).max() < 1e-3 - assert np.abs(image_slices[0] - image_slices[2]).max() < 1e-3 - - def test_inference_batch_single_identical(self): - super().test_inference_batch_single_identical(expected_max_diff=1e-2) - - @unittest.skip(reason="flakey and float16 requires CUDA") - def test_float16_inference(self): - super().test_float16_inference() - - @unittest.skip(reason="Test not supported.") - def test_callback_inputs(self): - pass - - @unittest.skip(reason="Test not supported.") - def test_callback_cfg(self): - pass diff --git a/tests/pipelines/wuerstchen/test_wuerstchen_decoder.py b/tests/pipelines/wuerstchen/test_wuerstchen_decoder.py deleted file mode 100644 index 5d2462d48d8b..000000000000 --- a/tests/pipelines/wuerstchen/test_wuerstchen_decoder.py +++ /dev/null @@ -1,192 +0,0 @@ -# coding=utf-8 -# Copyright 2025 HuggingFace Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import unittest - -import numpy as np -import torch -from transformers import CLIPTextConfig, CLIPTextModel, CLIPTokenizer - -from diffusers import DDPMWuerstchenScheduler, WuerstchenDecoderPipeline -from diffusers.pipelines.wuerstchen import PaellaVQModel, WuerstchenDiffNeXt -from diffusers.utils.testing_utils import enable_full_determinism, skip_mps, torch_device - -from ..test_pipelines_common import PipelineTesterMixin - - -enable_full_determinism() - - -class WuerstchenDecoderPipelineFastTests(PipelineTesterMixin, unittest.TestCase): - pipeline_class = WuerstchenDecoderPipeline - params = ["prompt"] - batch_params = ["image_embeddings", "prompt", "negative_prompt"] - required_optional_params = [ - "num_images_per_prompt", - "num_inference_steps", - "latents", - "negative_prompt", - "guidance_scale", - "output_type", - "return_dict", - ] - test_xformers_attention = False - callback_cfg_params = ["image_embeddings", "text_encoder_hidden_states"] - - @property - def text_embedder_hidden_size(self): - return 32 - - @property - def time_input_dim(self): - return 32 - - @property - def block_out_channels_0(self): - return self.time_input_dim - - @property - def time_embed_dim(self): - return self.time_input_dim * 4 - - @property - def dummy_tokenizer(self): - tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip") - return tokenizer - - @property - def dummy_text_encoder(self): - torch.manual_seed(0) - config = CLIPTextConfig( - bos_token_id=0, - eos_token_id=2, - projection_dim=self.text_embedder_hidden_size, - hidden_size=self.text_embedder_hidden_size, - intermediate_size=37, - layer_norm_eps=1e-05, - num_attention_heads=4, - num_hidden_layers=5, - pad_token_id=1, - vocab_size=1000, - ) - return CLIPTextModel(config).eval() - - @property - def dummy_vqgan(self): - torch.manual_seed(0) - - model_kwargs = { - "bottleneck_blocks": 1, - "num_vq_embeddings": 2, - } - model = PaellaVQModel(**model_kwargs) - return model.eval() - - @property - def dummy_decoder(self): - torch.manual_seed(0) - - model_kwargs = { - "c_cond": self.text_embedder_hidden_size, - "c_hidden": [320], - "nhead": [-1], - "blocks": [4], - "level_config": ["CT"], - "clip_embd": self.text_embedder_hidden_size, - "inject_effnet": [False], - } - - model = WuerstchenDiffNeXt(**model_kwargs) - return model.eval() - - def get_dummy_components(self): - decoder = self.dummy_decoder - text_encoder = self.dummy_text_encoder - tokenizer = self.dummy_tokenizer - vqgan = self.dummy_vqgan - - scheduler = DDPMWuerstchenScheduler() - - components = { - "decoder": decoder, - "vqgan": vqgan, - "text_encoder": text_encoder, - "tokenizer": tokenizer, - "scheduler": scheduler, - "latent_dim_scale": 4.0, - } - - return components - - def get_dummy_inputs(self, device, seed=0): - if str(device).startswith("mps"): - generator = torch.manual_seed(seed) - else: - generator = torch.Generator(device=device).manual_seed(seed) - inputs = { - "image_embeddings": torch.ones((1, 4, 4, 4), device=device), - "prompt": "horse", - "generator": generator, - "guidance_scale": 1.0, - "num_inference_steps": 2, - "output_type": "np", - } - return inputs - - def test_wuerstchen_decoder(self): - device = "cpu" - - components = self.get_dummy_components() - - pipe = self.pipeline_class(**components) - pipe = pipe.to(device) - - pipe.set_progress_bar_config(disable=None) - - output = pipe(**self.get_dummy_inputs(device)) - image = output.images - - image_from_tuple = pipe(**self.get_dummy_inputs(device), return_dict=False)[0] - - image_slice = 
image[0, -3:, -3:, -1] - image_from_tuple_slice = image_from_tuple[0, -3:, -3:, -1] - - assert image.shape == (1, 64, 64, 3) - - expected_slice = np.array([0.0000, 0.0000, 0.0089, 1.0000, 1.0000, 0.3927, 1.0000, 1.0000, 1.0000]) - assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2 - assert np.abs(image_from_tuple_slice.flatten() - expected_slice).max() < 1e-2 - - @skip_mps - def test_inference_batch_single_identical(self): - self._test_inference_batch_single_identical(expected_max_diff=1e-5) - - @skip_mps - def test_attention_slicing_forward_pass(self): - test_max_difference = torch_device == "cpu" - test_mean_pixel_difference = False - - self._test_attention_slicing_forward_pass( - test_max_difference=test_max_difference, - test_mean_pixel_difference=test_mean_pixel_difference, - ) - - @unittest.skip(reason="fp16 not supported and requires CUDA") - def test_float16_inference(self): - super().test_float16_inference() - - @unittest.skip("Test not supported.") - def test_encode_prompt_works_in_isolation(self): - super().test_encode_prompt_works_in_isolation() diff --git a/tests/pipelines/wuerstchen/test_wuerstchen_prior.py b/tests/pipelines/wuerstchen/test_wuerstchen_prior.py deleted file mode 100644 index 34f7c684b7d8..000000000000 --- a/tests/pipelines/wuerstchen/test_wuerstchen_prior.py +++ /dev/null @@ -1,273 +0,0 @@ -# coding=utf-8 -# Copyright 2025 HuggingFace Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License.
- -import unittest - -import numpy as np -import torch -from transformers import CLIPTextConfig, CLIPTextModel, CLIPTokenizer - -from diffusers import DDPMWuerstchenScheduler, WuerstchenPriorPipeline -from diffusers.pipelines.wuerstchen import WuerstchenPrior -from diffusers.utils.import_utils import is_peft_available -from diffusers.utils.testing_utils import enable_full_determinism, require_peft_backend, skip_mps, torch_device - - -if is_peft_available(): - from peft import LoraConfig - from peft.tuners.tuners_utils import BaseTunerLayer - -from ..test_pipelines_common import PipelineTesterMixin - - -enable_full_determinism() - - -class WuerstchenPriorPipelineFastTests(PipelineTesterMixin, unittest.TestCase): - pipeline_class = WuerstchenPriorPipeline - params = ["prompt"] - batch_params = ["prompt", "negative_prompt"] - required_optional_params = [ - "num_images_per_prompt", - "generator", - "num_inference_steps", - "latents", - "negative_prompt", - "guidance_scale", - "output_type", - "return_dict", - ] - test_xformers_attention = False - callback_cfg_params = ["text_encoder_hidden_states"] - - @property - def text_embedder_hidden_size(self): - return 32 - - @property - def time_input_dim(self): - return 32 - - @property - def block_out_channels_0(self): - return self.time_input_dim - - @property - def time_embed_dim(self): - return self.time_input_dim * 4 - - @property - def dummy_tokenizer(self): - tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip") - return tokenizer - - @property - def dummy_text_encoder(self): - torch.manual_seed(0) - config = CLIPTextConfig( - bos_token_id=0, - eos_token_id=2, - hidden_size=self.text_embedder_hidden_size, - intermediate_size=37, - layer_norm_eps=1e-05, - num_attention_heads=4, - num_hidden_layers=5, - pad_token_id=1, - vocab_size=1000, - ) - return CLIPTextModel(config).eval() - - @property - def dummy_prior(self): - torch.manual_seed(0) - - model_kwargs = { - "c_in": 2, - "c": 8, - "depth": 2, - "c_cond": 32, - "c_r": 8, - "nhead": 2, - } - - model = WuerstchenPrior(**model_kwargs) - return model.eval() - - def get_dummy_components(self): - prior = self.dummy_prior - text_encoder = self.dummy_text_encoder - tokenizer = self.dummy_tokenizer - - scheduler = DDPMWuerstchenScheduler() - - components = { - "prior": prior, - "text_encoder": text_encoder, - "tokenizer": tokenizer, - "scheduler": scheduler, - } - - return components - - def get_dummy_inputs(self, device, seed=0): - if str(device).startswith("mps"): - generator = torch.manual_seed(seed) - else: - generator = torch.Generator(device=device).manual_seed(seed) - inputs = { - "prompt": "horse", - "generator": generator, - "guidance_scale": 4.0, - "num_inference_steps": 2, - "output_type": "np", - } - return inputs - - def test_wuerstchen_prior(self): - device = "cpu" - - components = self.get_dummy_components() - - pipe = self.pipeline_class(**components) - pipe = pipe.to(device) - - pipe.set_progress_bar_config(disable=None) - - output = pipe(**self.get_dummy_inputs(device)) - image = output.image_embeddings - - image_from_tuple = pipe(**self.get_dummy_inputs(device), return_dict=False)[0] - - image_slice = image[0, 0, 0, -10:] - image_from_tuple_slice = image_from_tuple[0, 0, 0, -10:] - assert image.shape == (1, 2, 24, 24) - - expected_slice = np.array( - [ - -7172.837, - -3438.855, - -1093.312, - 388.8835, - -7471.467, - -7998.1206, - -5328.259, - 218.00089, - -2731.5745, - -8056.734, - ] - ) - assert np.abs(image_slice.flatten() - expected_slice).max() < 5e-2 
- assert np.abs(image_from_tuple_slice.flatten() - expected_slice).max() < 5e-2 - - @skip_mps - def test_inference_batch_single_identical(self): - self._test_inference_batch_single_identical( - expected_max_diff=3e-1, - ) - - @skip_mps - def test_attention_slicing_forward_pass(self): - test_max_difference = torch_device == "cpu" - test_mean_pixel_difference = False - - self._test_attention_slicing_forward_pass( - test_max_difference=test_max_difference, - test_mean_pixel_difference=test_mean_pixel_difference, - ) - - @unittest.skip(reason="flaky for now") - def test_float16_inference(self): - super().test_float16_inference() - - # override because we need to make sure latent_mean and latent_std are 0 - def test_callback_inputs(self): - components = self.get_dummy_components() - components["latent_mean"] = 0 - components["latent_std"] = 0 - pipe = self.pipeline_class(**components) - pipe = pipe.to(torch_device) - pipe.set_progress_bar_config(disable=None) - - self.assertTrue( - hasattr(pipe, "_callback_tensor_inputs"), - f" {self.pipeline_class} should have `_callback_tensor_inputs` that defines a list of tensor variables its callback function can use as inputs", - ) - - def callback_inputs_test(pipe, i, t, callback_kwargs): - missing_callback_inputs = set() - for v in pipe._callback_tensor_inputs: - if v not in callback_kwargs: - missing_callback_inputs.add(v) - self.assertTrue( - len(missing_callback_inputs) == 0, f"Missing callback tensor inputs: {missing_callback_inputs}" - ) - last_i = pipe.num_timesteps - 1 - if i == last_i: - callback_kwargs["latents"] = torch.zeros_like(callback_kwargs["latents"]) - return callback_kwargs - - inputs = self.get_dummy_inputs(torch_device) - inputs["callback_on_step_end"] = callback_inputs_test - inputs["callback_on_step_end_tensor_inputs"] = pipe._callback_tensor_inputs - inputs["output_type"] = "latent" - - output = pipe(**inputs)[0] - assert output.abs().sum() == 0 - - def check_if_lora_correctly_set(self, model) -> bool: - """ - Checks if the LoRA layers are correctly set with peft - """ - for module in model.modules(): - if isinstance(module, BaseTunerLayer): - return True - return False - - def get_lora_components(self): - prior = self.dummy_prior - - prior_lora_config = LoraConfig( - r=4, lora_alpha=4, target_modules=["to_q", "to_k", "to_v", "to_out.0"], init_lora_weights=False - ) - - return prior, prior_lora_config - - @require_peft_backend - def test_inference_with_prior_lora(self): - _, prior_lora_config = self.get_lora_components() - device = "cpu" - - components = self.get_dummy_components() - - pipe = self.pipeline_class(**components) - pipe = pipe.to(device) - - pipe.set_progress_bar_config(disable=None) - - output_no_lora = pipe(**self.get_dummy_inputs(device)) - image_embed = output_no_lora.image_embeddings - self.assertTrue(image_embed.shape == (1, 2, 24, 24)) - - pipe.prior.add_adapter(prior_lora_config) - self.assertTrue(self.check_if_lora_correctly_set(pipe.prior), "Lora not correctly set in prior") - - output_lora = pipe(**self.get_dummy_inputs(device)) - lora_image_embed = output_lora.image_embeddings - - self.assertTrue(image_embed.shape == lora_image_embed.shape) - - @unittest.skip("Test not supported because dtype cannot be inferred without the text encoder.") - def test_encode_prompt_works_in_isolation(self): - pass