Spaces: Running on Zero
10x demo speedup (#1)
- AOTI load (4d319cb3623ae213865e0c9729298ca0c85ce255)
Co-authored-by: Charles Bensimon <cbensimon@users.noreply.huggingface.co>
aoti.py
ADDED
@@ -0,0 +1,17 @@
+"""
+"""
+
+import torch
+from huggingface_hub import hf_hub_download
+from spaces.zero.torch.aoti import ZeroGPUCompiledModel
+from spaces.zero.torch.aoti import ZeroGPUWeights
+
+
+def aoti_load(module: torch.nn.Module, repo_id: str):
+    repeated_blocks = module._repeated_blocks
+    aoti_files = {name: hf_hub_download(repo_id, f'{name}.pt2') for name in repeated_blocks}
+    for block_name, aoti_file in aoti_files.items():
+        for block in module.modules():
+            if block.__class__.__name__ == block_name:
+                weights = ZeroGPUWeights(block.state_dict())
+                block.forward = ZeroGPUCompiledModel(aoti_file, weights)
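For reference, a minimal sketch of how the per-block `.pt2` artifacts that `aoti_load` downloads could be produced offline with AOT Inductor (not part of this change; the block, example inputs, and output directory are assumptions):

import torch
from torch._inductor import aoti_compile_and_package

# Hypothetical export step: compile one repeated transformer block so the
# resulting '<BlockClassName>.pt2' matches the name aoti_load() looks up
# via hf_hub_download(repo_id, f'{name}.pt2').
def export_block(block: torch.nn.Module, example_inputs: tuple, out_dir: str) -> str:
    exported = torch.export.export(block, example_inputs)
    return aoti_compile_and_package(exported, package_path=f"{out_dir}/{block.__class__.__name__}.pt2")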
app.py
CHANGED
@@ -1,9 +1,11 @@
+import os
+os.system("pip install --upgrade spaces")
+
 import sys
 sys.path.append('./')
 
 import gradio as gr
 import spaces
-import os
 import sys
 import subprocess
 import numpy as np
@@ -61,10 +63,11 @@ canny = CannyDetector()
 anyline = AnylineDetector.from_pretrained("TheMistoAI/MistoLine", filename="MTEED.pth", subfolder="Anyline")
 open_pose = OpenposeDetector.from_pretrained("lllyasviel/Annotators")
 
-
-
-
-pipe.
+import fa3
+from aoti import aoti_load
+
+pipe.transformer.fuse_qkv_projections()
+aoti_load(pipe.transformer, 'zerogpu-aoti/FLUX.1')
 
 def convert_from_image_to_cv2(img: Image) -> np.ndarray:
     return cv2.cvtColor(np.array(img), cv2.COLOR_RGB2BGR)
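Not shown in these hunks is how the patched pipeline is served; a minimal sketch of a ZeroGPU handler, assuming `pipe` is the FLUX pipeline already set up in app.py and the prompt-only signature is illustrative:

import gradio as gr
import spaces

@spaces.GPU  # the GPU is attached only for the duration of this call
def generate(prompt: str):
    # The AOTI-compiled transformer blocks installed by aoti_load() run here;
    # the rest of the pipeline executes eagerly.
    return pipe(prompt).images[0]

demo = gr.Interface(generate, gr.Textbox(label="Prompt"), gr.Image())
demo.launch()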
fa3.py
ADDED
@@ -0,0 +1,18 @@
+"""
+"""
+
+import torch
+from kernels import get_kernel
+
+
+_flash_attn_func = get_kernel("kernels-community/vllm-flash-attn3").flash_attn_func
+
+
+@torch.library.custom_op("flash::flash_attn_func", mutates_args=())
+def flash_attn_func(q: torch.Tensor, k: torch.Tensor, v: torch.Tensor) -> torch.Tensor:
+    outputs, lse = _flash_attn_func(q, k, v)
+    return outputs
+
+@flash_attn_func.register_fake
+def _(q, k, v, **kwargs):
+    return torch.empty_like(q).contiguous()
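The `register_fake` entry is what lets this op be traced with fake tensors (e.g. by torch.compile or the torch.export path used for AOTI) without launching the CUDA kernel. A quick smoke test, assuming a CUDA device and the (batch, seq_len, num_heads, head_dim) fp16 layout flash-attention kernels typically expect:

import torch
from fa3 import flash_attn_func

# Illustrative shapes only.
q = torch.randn(1, 128, 24, 128, device="cuda", dtype=torch.float16)
k = torch.randn_like(q)
v = torch.randn_like(q)

out = flash_attn_func(q, k, v)      # eager: dispatches to the vllm-flash-attn3 kernel
assert out.shape == q.shape

# Tracing works thanks to the fake kernel registered above.
compiled = torch.compile(flash_attn_func, fullgraph=True)
_ = compiled(q, k, v)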
requirements.txt
CHANGED
@@ -14,4 +14,5 @@ xformers
 sentencepiece
 peft
 scipy
-scikit-image
+scikit-image
+kernels