Remove CogVideoX mentions from single file docs; Test updates #9444

Merged
merged 4 commits on Sep 17, 2024
Changes from 2 commits

4 changes: 0 additions & 4 deletions docs/source/en/api/loaders/single_file.md
@@ -22,9 +22,6 @@ The [`~loaders.FromSingleFileMixin.from_single_file`] method allows you to load:

## Supported pipelines

-- [`CogVideoXPipeline`]
-- [`CogVideoXImageToVideoPipeline`]
-- [`CogVideoXVideoToVideoPipeline`]
- [`StableDiffusionPipeline`]
- [`StableDiffusionImg2ImgPipeline`]
- [`StableDiffusionInpaintPipeline`]
@@ -52,7 +49,6 @@ The [`~loaders.FromSingleFileMixin.from_single_file`] method allows you to load:
- [`UNet2DConditionModel`]
- [`StableCascadeUNet`]
- [`AutoencoderKL`]
-- [`AutoencoderKLCogVideoX`]
- [`ControlNetModel`]
- [`SD3Transformer2DModel`]
- [`FluxTransformer2DModel`]
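For reference, the single-file loading path this doc page covers looks roughly like the sketch below. The checkpoint path is a placeholder rather than anything referenced in this PR, and the commented CogVideoX line only illustrates the alternative loading route for the pipelines removed from the list above.

```python
from diffusers import StableDiffusionPipeline

# Single-file loading: the whole pipeline comes from one .safetensors/.ckpt checkpoint.
# The path below is a placeholder, not a file referenced by this PR.
pipe = StableDiffusionPipeline.from_single_file("path/to/v1-5-pruned-emaonly.safetensors")

# CogVideoX is no longer listed as a single-file pipeline; its checkpoints ship in the
# multi-folder diffusers layout and are loaded with from_pretrained instead, e.g.:
# pipe = CogVideoXPipeline.from_pretrained("THUDM/CogVideoX-2b")
```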
17 changes: 11 additions & 6 deletions tests/pipelines/cogvideo/test_cogvideox.py
@@ -57,6 +57,7 @@ class CogVideoXPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
"callback_on_step_end_tensor_inputs",
]
)
+test_xformers_attention = False

def get_dummy_components(self):
torch.manual_seed(0)
@@ -71,8 +72,8 @@ def get_dummy_components(self):
time_embed_dim=2,
text_embed_dim=32, # Must match with tiny-random-t5
num_layers=1,
-sample_width=16, # latent width: 2 -> final width: 16
-sample_height=16, # latent height: 2 -> final height: 16
+sample_width=2, # latent width: 2 -> final width: 16
+sample_height=2, # latent height: 2 -> final height: 16
sample_frames=9, # latent frames: (9 - 1) / 4 + 1 = 3 -> final frames: 9
patch_size=2,
temporal_compression_ratio=4,
@@ -254,6 +255,14 @@ def test_vae_tiling(self, expected_diff_max: float = 0.2):
generator_device = "cpu"
components = self.get_dummy_components()

+# Unlike the ImageToVideo test, overriding the transformer config is not needed here because the
+# positional embeddings are not learned and can be generated on-the-fly for any resolution
+# components["transformer"] = CogVideoXTransformer3DModel.from_config(
+#     components["transformer"].config,
+#     sample_height=16,
+#     sample_width=16,
+# )

pipe = self.pipeline_class(**components)
pipe.to("cpu")
pipe.set_progress_bar_config(disable=None)
@@ -280,10 +289,6 @@ def test_vae_tiling(self, expected_diff_max: float = 0.2):
"VAE tiling should not affect the inference results",
)

-@unittest.skip("xformers attention processor does not exist for CogVideoX")
-def test_xformers_attention_forwardGenerator_pass(self):
-    pass

def test_fused_qkv_projections(self):
device = "cpu" # ensure determinism for the device-dependent torch.Generator
components = self.get_dummy_components()
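The comments on the dummy transformer config above encode a small piece of size arithmetic: the sample_* values are latent-space sizes, and the pixel-space video size follows from the VAE's compression factors. A minimal sketch of that bookkeeping, assuming the usual 8x spatial VAE compression (the temporal_compression_ratio=4 comes straight from the config above):

```python
# Size bookkeeping behind the dummy-config comments.
# The 8x spatial compression factor is an assumption; temporal_compression_ratio=4 is from the config above.
vae_scale_factor_spatial = 8
temporal_compression_ratio = 4

sample_width = 2   # latent width in the updated config
sample_height = 2  # latent height in the updated config
sample_frames = 9  # pixel-space frame count referenced in the config comment

final_width = sample_width * vae_scale_factor_spatial    # 2 -> 16
final_height = sample_height * vae_scale_factor_spatial  # 2 -> 16
latent_frames = (sample_frames - 1) // temporal_compression_ratio + 1  # (9 - 1) / 4 + 1 = 3

assert (final_width, final_height, latent_frames) == (16, 16, 3)
```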
5 changes: 3 additions & 2 deletions tests/pipelines/cogvideo/test_cogvideox_image2video.py
@@ -269,8 +269,9 @@ def test_vae_tiling(self, expected_diff_max: float = 0.3):
generator_device = "cpu"
components = self.get_dummy_components()

-# The reason to modify it this way is because I2V Transformer limits the generation to resolutions.
-# See the if-statement on "self.use_learned_positional_embeddings"
+# The config is modified this way because the I2V Transformer limits generation to the resolutions used during initialization.
+# This limitation comes from using learned positional embeddings, which cannot be generated on-the-fly like sincos or RoPE embeddings.
+# See the if-statement on "self.use_learned_positional_embeddings" in diffusers/models/embeddings.py
components["transformer"] = CogVideoXTransformer3DModel.from_config(
components["transformer"].config,
sample_height=16,
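The new comment pins down why the override is needed: learned positional embeddings are a trained parameter with a fixed number of positions, so the transformer only supports the resolutions it was initialized with, whereas sincos (or RoPE) embeddings are recomputed from the grid size at call time. A rough sketch of that difference follows; it is illustrative only, not the diffusers implementation:

```python
import torch
import torch.nn as nn


def sincos_pos_embed(num_positions: int, dim: int) -> torch.Tensor:
    # Computed (non-learned) embeddings: can be rebuilt on-the-fly for any grid size.
    position = torch.arange(num_positions, dtype=torch.float32).unsqueeze(1)
    div_term = torch.exp(
        torch.arange(0, dim, 2, dtype=torch.float32) * (-torch.log(torch.tensor(10000.0)) / dim)
    )
    emb = torch.zeros(num_positions, dim)
    emb[:, 0::2] = torch.sin(position * div_term)
    emb[:, 1::2] = torch.cos(position * div_term)
    return emb


class LearnedPosEmbed(nn.Module):
    # Learned embeddings: the table size is fixed when the model is initialized,
    # so a larger grid at inference time has no trained rows to look up.
    def __init__(self, num_positions: int, dim: int):
        super().__init__()
        self.pos_embed = nn.Parameter(torch.zeros(num_positions, dim))

    def forward(self, num_tokens: int) -> torch.Tensor:
        if num_tokens > self.pos_embed.shape[0]:
            raise ValueError("Resolution exceeds the positions this model was initialized with.")
        return self.pos_embed[:num_tokens]


# A 16x16 latent grid needs 256 positions: sincos embeddings are simply recomputed,
# while a learned table created for an 8x8 grid (64 positions) cannot serve it.
print(sincos_pos_embed(256, 32).shape)    # torch.Size([256, 32])
print(LearnedPosEmbed(64, 32)(64).shape)  # torch.Size([64, 32])
```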
17 changes: 11 additions & 6 deletions tests/pipelines/cogvideo/test_cogvideox_video2video.py
@@ -51,6 +51,7 @@ class CogVideoXVideoToVideoPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
"callback_on_step_end_tensor_inputs",
]
)
+test_xformers_attention = False

def get_dummy_components(self):
torch.manual_seed(0)
@@ -65,8 +66,8 @@ def get_dummy_components(self):
time_embed_dim=2,
text_embed_dim=32, # Must match with tiny-random-t5
num_layers=1,
-sample_width=16, # latent width: 2 -> final width: 16
-sample_height=16, # latent height: 2 -> final height: 16
+sample_width=2, # latent width: 2 -> final width: 16
+sample_height=2, # latent height: 2 -> final height: 16
sample_frames=9, # latent frames: (9 - 1) / 4 + 1 = 3 -> final frames: 9
patch_size=2,
temporal_compression_ratio=4,
@@ -259,6 +260,14 @@ def test_vae_tiling(self, expected_diff_max: float = 0.2):
generator_device = "cpu"
components = self.get_dummy_components()

+# Unlike the ImageToVideo test, overriding the transformer config is not needed here because the
+# positional embeddings are not learned and can be generated on-the-fly for any resolution
+# components["transformer"] = CogVideoXTransformer3DModel.from_config(
+#     components["transformer"].config,
+#     sample_height=16,
+#     sample_width=16,
+# )

pipe = self.pipeline_class(**components)
pipe.to("cpu")
pipe.set_progress_bar_config(disable=None)
@@ -285,10 +294,6 @@ def test_vae_tiling(self, expected_diff_max: float = 0.2):
"VAE tiling should not affect the inference results",
)

-@unittest.skip("xformers attention processor does not exist for CogVideoX")
-def test_xformers_attention_forwardGenerator_pass(self):
-    pass

def test_fused_qkv_projections(self):
device = "cpu" # ensure determinism for the device-dependent torch.Generator
components = self.get_dummy_components()
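Both test files make the same swap: instead of re-declaring test_xformers_attention_forwardGenerator_pass just to skip it, each class now sets test_xformers_attention = False and leaves the decision to the shared tester mixin. A minimal sketch of that pattern, not the actual PipelineTesterMixin code:

```python
import unittest


class PipelineTesterMixinSketch:
    # Assumed default on the shared mixin; pipeline test classes opt out with one flag
    # instead of overriding and skipping the method themselves.
    test_xformers_attention = True

    def test_xformers_attention_forwardGenerator_pass(self):
        if not self.test_xformers_attention:
            self.skipTest("xformers attention processor is not supported for this pipeline")
        # ... the xformers-vs-default output comparison would run here ...


class CogVideoXLikeFastTests(PipelineTesterMixinSketch, unittest.TestCase):
    # CogVideoX has no xformers attention processor, so the check is disabled via the flag.
    test_xformers_attention = False
```

The flag keeps the skip in one place and spares every pipeline test class from duplicating the same no-op override.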