use pruna for quantization
- server/config.py +7 -0
- server/pipelines/controlnet.py +10 -1
- server/pipelines/img2imgFlux.py +101 -35
- server/pipelines/img2imgSDTurbo.py +10 -2
- server/pipelines/img2imgSDXL-Lightning.py +8 -0
- server/pipelines/img2imgSDXLTurbo.py +19 -3
- server/requirements.txt +18 -13
server/config.py
CHANGED
@@ -20,6 +20,7 @@ class Args(BaseModel):
     onediff: bool = False
     compel: bool = False
     debug: bool = False
+    pruna: bool = False
 
     def pretty_print(self) -> None:
         print("\n")
@@ -123,6 +124,12 @@ parser.add_argument(
     default=False,
     help="Enable OneDiff",
 )
+parser.add_argument(
+    "--pruna",
+    action="store_true",
+    default=False,
+    help="Enable Pruna",
+)
 parser.set_defaults(taesd=USE_TAESD)
 
 config = Args.model_validate(vars(parser.parse_args()))
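For reference, here is the flag's path from CLI to pipeline code as a minimal standalone sketch. Only Args and the parser calls are taken from config.py above; the rest is scaffolding for illustration.

import argparse

from pydantic import BaseModel


class Args(BaseModel):
    pruna: bool = False


parser = argparse.ArgumentParser()
parser.add_argument("--pruna", action="store_true", default=False, help="Enable Pruna")

# `--pruna` on the command line becomes `config.pruna == True`,
# which each pipeline later reads as `args.pruna`.
config = Args.model_validate(vars(parser.parse_args(["--pruna"])))
assert config.pruna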
server/pipelines/controlnet.py
CHANGED
@@ -17,6 +17,8 @@ from config import Args
 from pydantic import BaseModel, Field
 from PIL import Image
 import math
+from pruna import SmashConfig, smash
+from util import ParamsModel
 
 base_model = "SimianLuo/LCM_Dreamshaper_v7"
 taesd_model = "madebyollin/taesd"
@@ -58,7 +60,7 @@ class Pipeline:
     input_mode: str = "image"
     page_content: str = page_content
 
-    class InputParams(BaseModel):
+    class InputParams(ParamsModel):
         prompt: str = Field(
            default_prompt,
            title="Prompt",
@@ -170,6 +172,13 @@ class Pipeline:
             taesd_model, torch_dtype=torch_dtype, use_safetensors=True
         ).to(device)
 
+        if args.pruna:
+            # Create and smash your model
+            smash_config = SmashConfig()
+            smash_config["cacher"] = "deepcache"
+            smash_config["compiler"] = "stable_fast"
+            self.pipe = smash(model=self.pipe, smash_config=smash_config)
+
         if args.sfast:
             print("\nRunning sfast compile\n")
             from sfast.compilers.stable_diffusion_pipeline_compiler import (
server/pipelines/img2imgFlux.py
CHANGED
@@ -2,21 +2,19 @@ import torch
 
 from optimum.quanto import freeze, qfloat8, quantize
 from transformers.modeling_utils import PreTrainedModel
-
-from diffusers import (
-    FlowMatchEulerDiscreteScheduler,
-    AutoencoderKL,
-    AutoencoderTiny,
-    FluxImg2ImgPipeline,
-    FluxPipeline,
-)
-
-from diffusers import (
-    FluxImg2ImgPipeline,
-    FluxPipeline,
-    FluxTransformer2DModel,
-    GGUFQuantizationConfig,
-)
+from diffusers import AutoencoderTiny
+from diffusers.models.transformers.transformer_flux import FluxTransformer2DModel
+from diffusers.pipelines.flux.pipeline_flux_img2img import FluxImg2ImgPipeline
+from transformers import CLIPTextModel, CLIPTokenizer, T5EncoderModel, T5TokenizerFast
+from diffusers import FlowMatchEulerDiscreteScheduler, AutoencoderKL
+
+
+from pruna import smash, SmashConfig
+from pruna.telemetry import set_telemetry_metrics
+
+set_telemetry_metrics(False)  # disable telemetry for current session
+set_telemetry_metrics(False, set_as_default=True)  # disable telemetry globally
 
 try:
     import intel_extension_for_pytorch as ipex  # type: ignore
@@ -76,10 +74,10 @@ class Pipeline:
             1, min=1, max=15, title="Steps", field="range", hide=True, id="steps"
         )
         width: int = Field(
-
+            1024, min=2, max=15, title="Width", disabled=True, hide=True, id="width"
         )
         height: int = Field(
-
+            1024, min=2, max=15, title="Height", disabled=True, hide=True, id="height"
        )
        strength: float = Field(
            0.5,
@@ -107,33 +105,101 @@ class Pipeline:
         # "https://huggingface.co/city96/FLUX.1-dev-gguf/blob/main/flux1-dev-Q2_K.gguf"
         # )
         print("Loading model")
-
-
-
-
-
-
-
-
-
-
-
-
+
+        model_id = "black-forest-labs/FLUX.1-schnell"
+        model_revision = "refs/pr/1"
+        text_model_id = "openai/clip-vit-large-patch14"
+        model_data_type = torch.bfloat16
+        tokenizer = CLIPTokenizer.from_pretrained(
+            text_model_id, torch_dtype=model_data_type
+        )
+        text_encoder = CLIPTextModel.from_pretrained(
+            text_model_id, torch_dtype=model_data_type
+        )
+
+        # 2
+        tokenizer_2 = T5TokenizerFast.from_pretrained(
+            model_id,
+            subfolder="tokenizer_2",
+            torch_dtype=model_data_type,
+            revision=model_revision,
+        )
+        text_encoder_2 = T5EncoderModel.from_pretrained(
+            model_id,
+            subfolder="text_encoder_2",
+            torch_dtype=model_data_type,
+            revision=model_revision,
+        )
+
+        # Transformers
+        scheduler = FlowMatchEulerDiscreteScheduler.from_pretrained(
+            model_id, subfolder="scheduler", revision=model_revision
+        )
+        transformer = FluxTransformer2DModel.from_pretrained(
+            model_id,
+            subfolder="transformer",
+            torch_dtype=model_data_type,
+            revision=model_revision,
+        )
+
+        # VAE
+        # vae = AutoencoderKL.from_pretrained(
+        #     model_id,
+        #     subfolder="vae",
+        #     torch_dtype=model_data_type,
+        #     revision=model_revision,
+        # )
+
+        vae = AutoencoderTiny.from_pretrained(
+            "madebyollin/taef1", torch_dtype=torch.bfloat16
+        )
+
+        # Initialize the SmashConfig
+        smash_config = SmashConfig()
+        smash_config["quantizer"] = "quanto"
+        smash_config["quanto_calibrate"] = False
+        smash_config["quanto_weight_bits"] = "qint4"
+        # (
+        #     "qint4"  # "qfloat8" # or "qint2", "qint4", "qint8"
+        # )
+
+        transformer = smash(
+            model=transformer,
+            smash_config=smash_config,
+        )
+        text_encoder_2 = smash(
+            model=text_encoder_2,
+            smash_config=smash_config,
+        )
+
+        pipe = FluxImg2ImgPipeline(
+            scheduler=scheduler,
+            text_encoder=text_encoder,
+            tokenizer=tokenizer,
+            text_encoder_2=text_encoder_2,
+            tokenizer_2=tokenizer_2,
+            vae=vae,
             transformer=transformer,
-            torch_dtype=torch.bfloat16,
         )
-
-
-
-
+
+        # if args.taesd:
+        #     pipe.vae = AutoencoderTiny.from_pretrained(
+        #         taesd_path, torch_dtype=torch.bfloat16, use_safetensors=True
+        #     )
         # pipe.enable_model_cpu_offload()
-        pipe
+        pipe.text_encoder.to(device)
+        pipe.vae.to(device)
+        pipe.transformer.to(device)
+        pipe.text_encoder_2.to(device)
 
         # pipe.enable_model_cpu_offload()
+        # For added memory savings run this block, there is however a trade-off with speed.
+        # vae.enable_tiling()
+        # vae.enable_slicing()
+        # pipe.enable_sequential_cpu_offload()
 
         self.pipe = pipe
         self.pipe.set_progress_bar_config(disable=True)
-
         # vae = AutoencoderKL.from_pretrained(
         #     base_model_path, subfolder="vae", torch_dtype=torch_dtype
         # )
server/pipelines/img2imgSDTurbo.py
CHANGED
@@ -15,6 +15,7 @@ from PIL import Image
 from util import ParamsModel
 import math
 
+from pruna import smash, SmashConfig
 
 base_model = "stabilityai/sd-turbo"
 taesd_model = "madebyollin/taesd"
@@ -102,6 +103,13 @@ class Pipeline:
             taesd_model, torch_dtype=torch_dtype, use_safetensors=True
         ).to(device)
 
+        if args.pruna:
+            # Create and smash your model
+            smash_config = SmashConfig()
+            smash_config["cacher"] = "deepcache"
+            smash_config["compiler"] = "stable_fast"
+            self.pipe = smash(model=self.pipe, smash_config=smash_config)
+
         if args.sfast:
             from sfast.compilers.stable_diffusion_pipeline_compiler import (
                 compile,
@@ -130,8 +138,8 @@ class Pipeline:
 
         self.pipe.set_progress_bar_config(disable=True)
         self.pipe.to(device=device, dtype=torch_dtype)
-        if device.type != "mps":
-            self.pipe.unet.to(memory_format=torch.channels_last)
+        # if device.type != "mps":
+        #     self.pipe.unet.to(memory_format=torch.channels_last)
 
         if args.torch_compile:
             print("Running torch compile")
server/pipelines/img2imgSDXL-Lightning.py
CHANGED
@@ -20,6 +20,7 @@ from pydantic import BaseModel, Field
 from PIL import Image
 from util import ParamsModel
 import math
+from pruna import SmashConfig, smash
 
 base = "stabilityai/stable-diffusion-xl-base-1.0"
 repo = "ByteDance/SDXL-Lightning"
@@ -135,6 +136,13 @@ class Pipeline:
             self.pipe.scheduler.config, timestep_spacing="trailing"
         )
 
+        if args.pruna:
+            # Create and smash your model
+            smash_config = SmashConfig()
+            smash_config["cacher"] = "deepcache"
+            smash_config["compiler"] = "stable_fast"
+            self.pipe = smash(model=self.pipe, smash_config=smash_config)
+
         if args.sfast:
             from sfast.compilers.stable_diffusion_pipeline_compiler import (
                 compile,
server/pipelines/img2imgSDXLTurbo.py
CHANGED
@@ -17,6 +17,13 @@ from PIL import Image
 from util import ParamsModel
 import math
 
+from pruna import smash, SmashConfig
+from pruna.telemetry import set_telemetry_metrics
+
+set_telemetry_metrics(False)  # disable telemetry for current session
+set_telemetry_metrics(False, set_as_default=True)  # disable telemetry globally
+
+
 base_model = "stabilityai/sdxl-turbo"
 taesd_model = "madebyollin/taesdxl"
 
@@ -104,10 +111,11 @@ class Pipeline:
         )
 
     def __init__(self, args: Args, device: torch.device, torch_dtype: torch.dtype):
-        self.pipe = AutoPipelineForImage2Image.from_pretrained(
+        base_pipe = AutoPipelineForImage2Image.from_pretrained(
             base_model,
             safety_checker=None,
         )
+        self.pipe = None
         if args.taesd:
             self.pipe.vae = AutoencoderTiny.from_pretrained(
                 taesd_model, torch_dtype=torch_dtype, use_safetensors=True
@@ -125,11 +133,16 @@ class Pipeline:
             config.enable_cuda_graph = True
             self.pipe = compile(self.pipe, config=config)
 
-        self.pipe.set_progress_bar_config(disable=True)
-        self.pipe.to(device=device, dtype=torch_dtype)
         if device.type != "mps":
             self.pipe.unet.to(memory_format=torch.channels_last)
 
+        if args.pruna:
+            # Create and smash your model
+            smash_config = SmashConfig()
+            smash_config["cacher"] = "deepcache"
+            smash_config["compiler"] = "stable_fast"
+            self.pipe = smash(model=base_pipe, smash_config=smash_config)
+
         if args.torch_compile:
             print("Running torch compile")
             self.pipe.unet = torch.compile(
@@ -151,6 +164,9 @@ class Pipeline:
                 requires_pooled=[False, True],
             )
 
+        self.pipe.set_progress_bar_config(disable=True)
+        self.pipe.to(device=device, dtype=torch_dtype)
+
     def predict(self, params: "Pipeline.InputParams") -> Image.Image:
         generator = torch.manual_seed(params.seed)
         prompt = params.prompt
server/requirements.txt
CHANGED
@@ -1,30 +1,35 @@
-
-
+--extra-index-url https://download.pytorch.org/whl/cu118
+torch==2.5.1
+torchvision
+torchaudio
+xformers; sys_platform != 'darwin' or platform_machine != 'arm64'
+numpy
+diffusers
+llvmlite>=0.39.0
+numba>=0.56.0
+pruna[stable-fast] ; sys_platform != 'darwin' or platform_machine != 'arm64'
+transformers
+pydantic
 huggingface-hub
 hf_transfer
-
-
-fastapi==0.115.6
-uvicorn[standard]==0.34.0
+fastapi
+uvicorn[standard]
 Pillow==11.0.0
-accelerate
+accelerate
 compel==2.0.2
 controlnet-aux==0.0.9
 peft==0.14.0
-xformers; sys_platform != 'darwin' or platform_machine != 'arm64'
 markdown2
 safetensors
-stable_fast @ https://github.com/chengzeyi/stable-fast/releases/download/nightly/stable_fast-1.0.5.dev20241127+torch230cu121-cp310-cp310-manylinux2014_x86_64.whl ; sys_platform != 'darwin' or platform_machine != 'arm64'
+# stable_fast @ https://github.com/chengzeyi/stable-fast/releases/download/nightly/stable_fast-1.0.5.dev20241127+torch230cu121-cp310-cp310-manylinux2014_x86_64.whl ; sys_platform != 'darwin' or platform_machine != 'arm64'
 #oneflow @ https://github.com/siliconflow/oneflow_releases/releases/download/community_cu121/oneflow-0.9.1.dev20241114%2Bcu121-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl ; sys_platform != 'darwin' or platform_machine != 'arm64'
 #onediff @ git+https://github.com/siliconflow/onediff.git@main#egg=onediff ; sys_platform != 'darwin' or platform_machine != 'arm64'
 setuptools
 mpmath==1.3.0
-numpy==1.*
 controlnet-aux
 sentencepiece==0.2.0
-optimum-quanto
+optimum-quanto  # has to be optimum-quanto==0.2.5 for pruna int4
 gguf==0.13.0
-pydantic>=2.7.0
 types-Pillow
 mypy
-python-dotenv
+python-dotenv
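Install note: the pins above target the CUDA 11.8 wheel index (--extra-index-url) with torch==2.5.1, and pruna[stable-fast] carries the same environment marker as xformers, so both are skipped automatically on Apple Silicon. The assumed install path is a plain pip install -r server/requirements.txt; per the inline comment, pin optimum-quanto==0.2.5 if the pruna int4 path requires it.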