24 GB (~22,918 MiB to be exact) via torchao on 4090

#9
by C0nsumption

This will get you going. It's currently in CLI format, so this was just a quick and dirty test.
Remember to reference the Qwen team's / Diffusers team's code for the main weights even when using this.
Everything seems to be working fine; I'll drop some more examples soon.

#!/usr/bin/env python3
# qwen_edit_test.py

import sys
import time
from pathlib import Path

import torch
from PIL import Image
from diffusers import AutoModel, DiffusionPipeline, TorchAoConfig

# 🚑 TorchAO hotfix: patch missing operator for quantized tensors
from torchao.dtypes.affine_quantized_tensor import AffineQuantizedTensor


def _safe_has_compatible_shallow_copy_type(t1, t2):
    # Always return True to bypass crash inside .to() / Accelerate offload
    return True


torch._has_compatible_shallow_copy_type = _safe_has_compatible_shallow_copy_type
AffineQuantizedTensor.__torch_function__ = torch._C._disabled_torch_function_impl  # disable torch_function dispatch


def main():
    # WHERE YOUR ACTUAL SETTINGS GO
    # 🔧 Hardcoded config

    model_dir = Path("./Qwen-Image-Edit-2509")
    input_paths = [Path("./sub.jpeg"), Path("./pose1.png")]
    out_path = Path("qwen_edit_test.png")
    lora_path = None  # set Path(".../weights.safetensors") if needed

    prompt = """Convert subject in image 1 to match the pose in image 2, keep the character the same and the
    background consistent with the inside of a fast food store. ensure to keep the face, clothing, and hair
    consistent. She should be sitting on the counter with the cash registers and advertisements to her back"""
    negative_prompt = "ugly"

    steps = 40
    seed = 1234

    # sanity checks
    if not model_dir.exists():
        print(f"[ERR] model_dir not found: {model_dir}", file=sys.stderr)
        sys.exit(2)
    for pth in input_paths:
        if not pth.is_file():
            print(f"[ERR] input image not found: {pth}", file=sys.stderr)
            sys.exit(2)
    if lora_path and not lora_path.is_file():
        print(f"[ERR] lora_path not found: {lora_path}", file=sys.stderr)
        sys.exit(2)

    out_path.parent.mkdir(parents=True, exist_ok=True)

    print(f"[INFO] model_dir={model_dir} lora={lora_path} inputs={input_paths}")
    t0 = time.time()

    # TorchAO quantization config
    torch_dtype = torch.bfloat16
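    # "int8wo" = int8 weight-only quantization: the transformer weights are
    # stored in int8 (roughly half the size of bf16); activations stay in bf16.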
    quantization_config = TorchAoConfig("int8wo")

    # Quantized transformer backbone
    transformer = AutoModel.from_pretrained(
        str(model_dir),
        subfolder="transformer",
        quantization_config=quantization_config,
        torch_dtype=torch_dtype,
    )

    # Build pipeline
    pipe = DiffusionPipeline.from_pretrained(
        str(model_dir),
        transformer=transformer,
        torch_dtype=torch_dtype,
    )

    if lora_path:
        pipe.load_lora_weights(str(lora_path))

    # Save VRAM
    pipe.enable_model_cpu_offload()
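    # (moves each component to the GPU only while it runs and back to system
    #  RAM afterwards; slower than keeping everything resident, but lighter on VRAM)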

    generator = torch.manual_seed(seed) if seed is not None else None

    # Load input images
    images = [Image.open(str(p)).convert("RGB") for p in input_paths]

    print("[INFO] editing…")
    kwargs = dict(
        image=images if len(images) > 1 else images[0],
        prompt=prompt,
        negative_prompt=negative_prompt,
        generator=generator,
        num_inference_steps=steps,
        num_images_per_prompt=1,
    )

    with torch.inference_mode():
        result = pipe(**kwargs)
        img = result.images[0]

    img.save(out_path)
    dt = time.time() - t0
    print(f"[OK] saved → {out_path}  ({dt:.2f}s)")


if __name__ == "__main__":
    main()
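If you want to sanity-check the peak-VRAM number on your own card, a small wrapper like the one below works. This is just a sketch, not part of the script above; it assumes the script is saved as qwen_edit_test.py so main() can be imported, and keep in mind that torch.cuda.max_memory_allocated() only counts PyTorch's own allocations, so nvidia-smi will report a somewhat higher figure.

#!/usr/bin/env python3
# vram_probe.py: sketch that reports peak VRAM used by the edit run

import torch

from qwen_edit_test import main  # the script above


def run_with_vram_report(fn):
    """Run fn() and print the peak CUDA memory PyTorch allocated."""
    torch.cuda.reset_peak_memory_stats()
    result = fn()
    peak_mib = torch.cuda.max_memory_allocated() / (1024 ** 2)
    print(f"[INFO] peak allocated VRAM: {peak_mib:.0f} MiB")
    return result


if __name__ == "__main__":
    run_with_vram_report(main)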
Second example [images: Setting, Subject 1, Subject 2 inputs and the edited output]

Prompt: "Place the girl from image1 and the girl from image2 on the sofa in image3, drinking coffee. Ensure to keep the face, clothing, and hair consistent."

I'd say, considering that it's quantized and was run on 24 GB of VRAM, it did pretty darn well 🧍🏽‍♂️

Hey there, good work. I was wondering how long this process usually takes to complete?
