
Commit 24a0374

Merge branch 'main' into enable-hotswap-testing-ci
2 parents: 714c458 + 59f1b7b

File tree: 3 files changed, +41 -10 lines


docs/source/en/api/pipelines/aura_flow.md (+15)

@@ -89,6 +89,21 @@ image = pipeline(prompt).images[0]
 image.save("auraflow.png")
 ```
 
+## Support for `torch.compile()`
+
+AuraFlow can be compiled with `torch.compile()` to reduce inference latency, even across different resolutions. First, install a PyTorch nightly build following the instructions [here](https://pytorch.org/). The snippet below shows the changes needed to enable this:
+
+```diff
++ torch.fx.experimental._config.use_duck_shape = False
++ pipeline.transformer = torch.compile(
+    pipeline.transformer, fullgraph=True, dynamic=True
+)
+```
+
+This yields speedups ranging from 100% (at low resolutions) to 30% (at 1536x1536 resolution).
+
+Thanks to [AstraliteHeart](https://github.com/huggingface/diffusers/pull/11297/), who helped us rewrite the [`AuraFlowTransformer2DModel`] class so that the above works for different resolutions ([PR](https://github.com/huggingface/diffusers/pull/11297/)).
+
 ## AuraFlowPipeline
 
 [[autodoc]] AuraFlowPipeline
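For context, here is a minimal end-to-end sketch of the documented change applied to an AuraFlow pipeline. The checkpoint name, dtype, and the CUDA device are illustrative assumptions, not part of this commit; only the two compile-related lines come from the doc above.

```python
# Sketch only: assumes a recent PyTorch nightly, a CUDA device, and the
# "fal/AuraFlow" checkpoint; adjust to your setup.
import torch
import torch.fx.experimental._config  # ensure the experimental shape-config module is loaded

from diffusers import AuraFlowPipeline

pipeline = AuraFlowPipeline.from_pretrained(
    "fal/AuraFlow", torch_dtype=torch.bfloat16
).to("cuda")

# Disable duck shaping so distinct dynamic dimensions keep their own symbols,
# which lets the same compiled graph serve multiple resolutions.
torch.fx.experimental._config.use_duck_shape = False

# Compile only the transformer; dynamic=True avoids recompiling when the
# latent resolution changes between calls.
pipeline.transformer = torch.compile(
    pipeline.transformer, fullgraph=True, dynamic=True
)

image = pipeline("a photo of a corgi astronaut", height=1024, width=1024).images[0]
image.save("auraflow_compiled.png")
```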

src/diffusers/pipelines/hunyuan_video/pipeline_hunyuan_video_image2video.py (+1 -1)

@@ -344,7 +344,7 @@ def _get_llama_prompt_embeds(
         )
         prompt_embeds = self.text_encoder(
             **expanded_inputs,
-            pixel_value=image_embeds,
+            pixel_values=image_embeds,
             output_hidden_states=True,
         ).hidden_states[-(num_hidden_layers_to_skip + 1)]
         prompt_embeds = prompt_embeds.to(dtype=dtype)
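The pipeline's text encoder is a Llava-style model (see the test change below), and that model declares its image input as `pixel_values` (plural), so the misspelled keyword never delivered the image embeddings to the encoder. A small sketch to verify the parameter name; it only inspects the signature and downloads no checkpoint:

```python
# Confirm that the keyword fixed above matches the Llava forward signature.
import inspect

from transformers import LlavaForConditionalGeneration

params = inspect.signature(LlavaForConditionalGeneration.forward).parameters
print("pixel_values" in params)  # True  - the declared image argument
print("pixel_value" in params)   # False - the old misspelling is not a declared parameter
```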

tests/pipelines/hunyuan_video/test_hunyuan_image2video.py (+25 -9)

@@ -24,9 +24,11 @@
     CLIPTextModel,
     CLIPTokenizer,
     LlamaConfig,
-    LlamaModel,
-    LlamaTokenizer,
+    LlamaTokenizerFast,
+    LlavaConfig,
+    LlavaForConditionalGeneration,
 )
+from transformers.models.clip import CLIPVisionConfig
 
 from diffusers import (
     AutoencoderKLHunyuanVideo,

@@ -116,19 +118,29 @@ def get_dummy_components(self, num_layers: int = 1, num_single_layers: int = 1):
         torch.manual_seed(0)
         scheduler = FlowMatchEulerDiscreteScheduler(shift=7.0)
 
-        llama_text_encoder_config = LlamaConfig(
+        text_config = LlamaConfig(
             bos_token_id=0,
             eos_token_id=2,
             hidden_size=16,
             intermediate_size=37,
             layer_norm_eps=1e-05,
             num_attention_heads=4,
             num_hidden_layers=2,
-            pad_token_id=1,
+            pad_token_id=100,
             vocab_size=1000,
             hidden_act="gelu",
             projection_dim=32,
         )
+        vision_config = CLIPVisionConfig(
+            hidden_size=8,
+            intermediate_size=37,
+            projection_dim=32,
+            num_attention_heads=4,
+            num_hidden_layers=2,
+            image_size=224,
+        )
+        llava_text_encoder_config = LlavaConfig(vision_config, text_config, pad_token_id=100, image_token_index=101)
+
         clip_text_encoder_config = CLIPTextConfig(
             bos_token_id=0,
             eos_token_id=2,

@@ -144,23 +156,23 @@ def get_dummy_components(self, num_layers: int = 1, num_single_layers: int = 1):
         )
 
         torch.manual_seed(0)
-        text_encoder = LlamaModel(llama_text_encoder_config)
-        tokenizer = LlamaTokenizer.from_pretrained("finetrainers/dummy-hunyaunvideo", subfolder="tokenizer")
+        text_encoder = LlavaForConditionalGeneration(llava_text_encoder_config)
+        tokenizer = LlamaTokenizerFast.from_pretrained("finetrainers/dummy-hunyaunvideo", subfolder="tokenizer")
 
         torch.manual_seed(0)
         text_encoder_2 = CLIPTextModel(clip_text_encoder_config)
         tokenizer_2 = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
 
         torch.manual_seed(0)
         image_processor = CLIPImageProcessor(
-            crop_size=336,
+            crop_size=224,
             do_center_crop=True,
             do_normalize=True,
             do_resize=True,
             image_mean=[0.48145466, 0.4578275, 0.40821073],
             image_std=[0.26862954, 0.26130258, 0.27577711],
             resample=3,
-            size=336,
+            size=224,
         )
 
         components = {

@@ -190,14 +202,18 @@ def get_dummy_inputs(self, device, seed=0):
             "prompt_template": {
                 "template": "{}",
                 "crop_start": 0,
+                "image_emb_len": 49,
+                "image_emb_start": 5,
+                "image_emb_end": 54,
+                "double_return_token_id": 0,
             },
             "generator": generator,
             "num_inference_steps": 2,
             "guidance_scale": 4.5,
             "height": image_height,
             "width": image_width,
             "num_frames": 9,
-            "max_sequence_length": 16,
+            "max_sequence_length": 64,
             "output_type": "pt",
         }
         return inputs
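The dummy components now agree on a 224-pixel vision tower: `CLIPImageProcessor` resizes and center-crops to 224 to match `CLIPVisionConfig(image_size=224)`, and with CLIP's default `patch_size=32` (and the CLS token dropped, as Llava does by default) that gives 7 x 7 = 49 patch embeddings, which is presumably where `"image_emb_len": 49` in the prompt template comes from. A small sketch of that arithmetic, under those default-value assumptions:

```python
# Sketch of the size bookkeeping behind the updated dummy components.
# Assumes CLIP's default patch_size=32 and Llava's default strategy of
# dropping the CLS token; these defaults are assumptions, not stated in the diff.
import numpy as np
from PIL import Image
from transformers import CLIPImageProcessor
from transformers.models.clip import CLIPVisionConfig

vision_config = CLIPVisionConfig(
    hidden_size=8,
    intermediate_size=37,
    num_attention_heads=4,
    num_hidden_layers=2,
    image_size=224,
)

image_processor = CLIPImageProcessor(
    crop_size=224,
    do_center_crop=True,
    do_normalize=True,
    do_resize=True,
    image_mean=[0.48145466, 0.4578275, 0.40821073],
    image_std=[0.26862954, 0.26130258, 0.27577711],
    resample=3,
    size=224,
)

# Whatever the input resolution, the processor emits 224x224 pixel values,
# matching the vision tower's expected image_size.
image = Image.fromarray(np.zeros((31, 17, 3), dtype=np.uint8))
pixel_values = image_processor(image, return_tensors="pt").pixel_values
print(pixel_values.shape)  # torch.Size([1, 3, 224, 224])

# 224 // 32 = 7 patches per side -> 49 patch embeddings once the CLS token
# is dropped, matching "image_emb_len": 49 in get_dummy_inputs.
print((vision_config.image_size // vision_config.patch_size) ** 2)  # 49
```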
