
Commit 24a0374

Merge branch 'main' into enable-hotswap-testing-ci
2 parents: 714c458 + 59f1b7b

File tree: 3 files changed, +41 -10 lines


docs/source/en/api/pipelines/aura_flow.md (+15)

@@ -89,6 +89,21 @@ image = pipeline(prompt).images[0]
 image.save("auraflow.png")
 ```
 
+## Support for `torch.compile()`
+
+AuraFlow can be compiled with `torch.compile()` to reduce inference latency, even across different resolutions. First, install a PyTorch nightly build following the instructions [here](https://pytorch.org/). The snippet below shows the changes needed to enable this:
+
+```diff
++ torch.fx.experimental._config.use_duck_shape = False
++ pipeline.transformer = torch.compile(
+    pipeline.transformer, fullgraph=True, dynamic=True
+)
+```
+
+This yields speedups ranging from 100% (at low resolutions) to 30% (at 1536x1536 resolution).
+
+Thanks to [AstraliteHeart](https://github.com/huggingface/diffusers/pull/11297/), who helped us rewrite the [`AuraFlowTransformer2DModel`] class so that the above works for different resolutions ([PR](https://github.com/huggingface/diffusers/pull/11297/)).
+
 ## AuraFlowPipeline
 
 [[autodoc]] AuraFlowPipeline
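For context, here is a minimal end-to-end sketch of the documented change applied to an AuraFlow pipeline. The checkpoint name, dtype, and the CUDA device are illustrative assumptions, not part of this commit; only the two compile-related lines come from the doc above.

```python
# Sketch only: assumes a recent PyTorch nightly, a CUDA device, and the
# "fal/AuraFlow" checkpoint; adjust to your setup.
import torch
import torch.fx.experimental._config  # ensure the experimental shape-config module is loaded

from diffusers import AuraFlowPipeline

pipeline = AuraFlowPipeline.from_pretrained(
    "fal/AuraFlow", torch_dtype=torch.bfloat16
).to("cuda")

# Disable duck shaping so distinct dynamic dimensions keep their own symbols,
# which lets the same compiled graph serve multiple resolutions.
torch.fx.experimental._config.use_duck_shape = False

# Compile only the transformer; dynamic=True avoids recompiling when the
# latent resolution changes between calls.
pipeline.transformer = torch.compile(
    pipeline.transformer, fullgraph=True, dynamic=True
)

image = pipeline("a photo of a corgi astronaut", height=1024, width=1024).images[0]
image.save("auraflow_compiled.png")
```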

src/diffusers/pipelines/hunyuan_video/pipeline_hunyuan_video_image2video.py (+1 -1)

@@ -344,7 +344,7 @@ def _get_llama_prompt_embeds(
         )
         prompt_embeds = self.text_encoder(
             **expanded_inputs,
-            pixel_value=image_embeds,
+            pixel_values=image_embeds,
             output_hidden_states=True,
         ).hidden_states[-(num_hidden_layers_to_skip + 1)]
         prompt_embeds = prompt_embeds.to(dtype=dtype)
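The pipeline's text encoder is a Llava-style model (see the test change below), and that model declares its image input as `pixel_values` (plural), so the misspelled keyword never delivered the image embeddings to the encoder. A small sketch to verify the parameter name; it only inspects the signature and downloads no checkpoint:

```python
# Confirm that the keyword fixed above matches the Llava forward signature.
import inspect

from transformers import LlavaForConditionalGeneration

params = inspect.signature(LlavaForConditionalGeneration.forward).parameters
print("pixel_values" in params)  # True  - the declared image argument
print("pixel_value" in params)   # False - the old misspelling is not a declared parameter
```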

tests/pipelines/hunyuan_video/test_hunyuan_image2video.py (+25 -9)

@@ -24,9 +24,11 @@
     CLIPTextModel,
     CLIPTokenizer,
     LlamaConfig,
-    LlamaModel,
-    LlamaTokenizer,
+    LlamaTokenizerFast,
+    LlavaConfig,
+    LlavaForConditionalGeneration,
 )
+from transformers.models.clip import CLIPVisionConfig
 
 from diffusers import (
     AutoencoderKLHunyuanVideo,

@@ -116,19 +118,29 @@ def get_dummy_components(self, num_layers: int = 1, num_single_layers: int = 1):
         torch.manual_seed(0)
         scheduler = FlowMatchEulerDiscreteScheduler(shift=7.0)
 
-        llama_text_encoder_config = LlamaConfig(
+        text_config = LlamaConfig(
             bos_token_id=0,
             eos_token_id=2,
             hidden_size=16,
             intermediate_size=37,
             layer_norm_eps=1e-05,
             num_attention_heads=4,
             num_hidden_layers=2,
-            pad_token_id=1,
+            pad_token_id=100,
             vocab_size=1000,
             hidden_act="gelu",
             projection_dim=32,
         )
+        vision_config = CLIPVisionConfig(
+            hidden_size=8,
+            intermediate_size=37,
+            projection_dim=32,
+            num_attention_heads=4,
+            num_hidden_layers=2,
+            image_size=224,
+        )
+        llava_text_encoder_config = LlavaConfig(vision_config, text_config, pad_token_id=100, image_token_index=101)
+
         clip_text_encoder_config = CLIPTextConfig(
             bos_token_id=0,
             eos_token_id=2,

@@ -144,23 +156,23 @@ def get_dummy_components(self, num_layers: int = 1, num_single_layers: int = 1):
         )
 
         torch.manual_seed(0)
-        text_encoder = LlamaModel(llama_text_encoder_config)
-        tokenizer = LlamaTokenizer.from_pretrained("finetrainers/dummy-hunyaunvideo", subfolder="tokenizer")
+        text_encoder = LlavaForConditionalGeneration(llava_text_encoder_config)
+        tokenizer = LlamaTokenizerFast.from_pretrained("finetrainers/dummy-hunyaunvideo", subfolder="tokenizer")
 
         torch.manual_seed(0)
         text_encoder_2 = CLIPTextModel(clip_text_encoder_config)
         tokenizer_2 = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
 
         torch.manual_seed(0)
         image_processor = CLIPImageProcessor(
-            crop_size=336,
+            crop_size=224,
             do_center_crop=True,
             do_normalize=True,
             do_resize=True,
             image_mean=[0.48145466, 0.4578275, 0.40821073],
             image_std=[0.26862954, 0.26130258, 0.27577711],
             resample=3,
-            size=336,
+            size=224,
         )
 
         components = {

@@ -190,14 +202,18 @@ def get_dummy_inputs(self, device, seed=0):
             "prompt_template": {
                 "template": "{}",
                 "crop_start": 0,
+                "image_emb_len": 49,
+                "image_emb_start": 5,
+                "image_emb_end": 54,
+                "double_return_token_id": 0,
             },
             "generator": generator,
             "num_inference_steps": 2,
             "guidance_scale": 4.5,
             "height": image_height,
             "width": image_width,
             "num_frames": 9,
-            "max_sequence_length": 16,
+            "max_sequence_length": 64,
             "output_type": "pt",
         }
         return inputs
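The dummy components now agree on a 224-pixel vision tower: `CLIPImageProcessor` resizes and center-crops to 224 to match `CLIPVisionConfig(image_size=224)`, and with CLIP's default `patch_size=32` (and the CLS token dropped, as Llava does by default) that gives 7 x 7 = 49 patch embeddings, which is presumably where `"image_emb_len": 49` in the prompt template comes from. A small sketch of that arithmetic, under those default-value assumptions:

```python
# Sketch of the size bookkeeping behind the updated dummy components.
# Assumes CLIP's default patch_size=32 and Llava's default strategy of
# dropping the CLS token; these defaults are assumptions, not stated in the diff.
import numpy as np
from PIL import Image
from transformers import CLIPImageProcessor
from transformers.models.clip import CLIPVisionConfig

vision_config = CLIPVisionConfig(
    hidden_size=8,
    intermediate_size=37,
    num_attention_heads=4,
    num_hidden_layers=2,
    image_size=224,
)

image_processor = CLIPImageProcessor(
    crop_size=224,
    do_center_crop=True,
    do_normalize=True,
    do_resize=True,
    image_mean=[0.48145466, 0.4578275, 0.40821073],
    image_std=[0.26862954, 0.26130258, 0.27577711],
    resample=3,
    size=224,
)

# Whatever the input resolution, the processor emits 224x224 pixel values,
# matching the vision tower's expected image_size.
image = Image.fromarray(np.zeros((31, 17, 3), dtype=np.uint8))
pixel_values = image_processor(image, return_tensors="pt").pixel_values
print(pixel_values.shape)  # torch.Size([1, 3, 224, 224])

# 224 // 32 = 7 patches per side -> 49 patch embeddings once the CLS token
# is dropped, matching "image_emb_len": 49 in get_dummy_inputs.
print((vision_config.image_size // vision_config.patch_size) ** 2)  # 49
```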
