Spaces:

orange15
/

wayfu-amazing-art-v2

Build error

App Files Files Community

orange15 committed on Apr 23

Commit

397cbc8

verified ·

1 Parent(s): 10adb8c

Upload 69 files

Browse files

This view is limited to 50 files because it contains too many changes. See raw diff

Files changed (50) hide show

.gitattributes +65 -35
Dockerfile +21 -0
README.md +13 -10
app.py +35 -0
check_packages.py +4 -0
log.py +78 -0
requirements.txt +10 -0
wfControl/LICENSE +201 -0
wfControl/README.md +170 -0
wfControl/assets/book.jpg +0 -0
wfControl/assets/cartoon_boy.png +3 -0
wfControl/assets/clock.jpg +3 -0
wfControl/assets/coffee.png +0 -0
wfControl/assets/demo/book_omini.jpg +0 -0
wfControl/assets/demo/clock_omini.jpg +0 -0
wfControl/assets/demo/demo_this_is_omini_control.jpg +3 -0
wfControl/assets/demo/dreambooth_res.jpg +3 -0
wfControl/assets/demo/man_omini.jpg +0 -0
wfControl/assets/demo/monalisa_omini.jpg +3 -0
wfControl/assets/demo/oranges_omini.jpg +0 -0
wfControl/assets/demo/panda_omini.jpg +0 -0
wfControl/assets/demo/penguin_omini.jpg +0 -0
wfControl/assets/demo/rc_car_omini.jpg +0 -0
wfControl/assets/demo/room_corner_canny.jpg +0 -0
wfControl/assets/demo/room_corner_coloring.jpg +0 -0
wfControl/assets/demo/room_corner_deblurring.jpg +0 -0
wfControl/assets/demo/room_corner_depth.jpg +0 -0
wfControl/assets/demo/scene_variation.jpg +3 -0
wfControl/assets/demo/shirt_omini.jpg +0 -0
wfControl/assets/demo/try_on.jpg +3 -0
wfControl/assets/monalisa.jpg +3 -0
wfControl/assets/oranges.jpg +0 -0
wfControl/assets/penguin.jpg +0 -0
wfControl/assets/rc_car.jpg +3 -0
wfControl/assets/room_corner.jpg +3 -0
wfControl/assets/test_in.jpg +0 -0
wfControl/assets/test_out.jpg +0 -0
wfControl/assets/tshirt.jpg +3 -0
wfControl/assets/vase.jpg +0 -0
wfControl/assets/vase_hq.jpg +3 -0
wfControl/examples/inpainting.ipynb +143 -0
wfControl/examples/spatial.ipynb +184 -0
wfControl/examples/subject.ipynb +214 -0
wfControl/examples/subject_1024.ipynb +221 -0
wfControl/requirements.txt +7 -0
wfControl/src/flux/block.py +339 -0
wfControl/src/flux/condition.py +138 -0
wfControl/src/flux/generate.py +321 -0
wfControl/src/flux/lora_controller.py +75 -0
wfControl/src/flux/pipeline_tools.py +52 -0

.gitattributes CHANGED Viewed

@@ -1,35 +1,65 @@
-*.7z filter=lfs diff=lfs merge=lfs -text
-*.arrow filter=lfs diff=lfs merge=lfs -text
-*.bin filter=lfs diff=lfs merge=lfs -text
-*.bz2 filter=lfs diff=lfs merge=lfs -text
-*.ckpt filter=lfs diff=lfs merge=lfs -text
-*.ftz filter=lfs diff=lfs merge=lfs -text
-*.gz filter=lfs diff=lfs merge=lfs -text
-*.h5 filter=lfs diff=lfs merge=lfs -text
-*.joblib filter=lfs diff=lfs merge=lfs -text
-*.lfs.* filter=lfs diff=lfs merge=lfs -text
-*.mlmodel filter=lfs diff=lfs merge=lfs -text
-*.model filter=lfs diff=lfs merge=lfs -text
-*.msgpack filter=lfs diff=lfs merge=lfs -text
-*.npy filter=lfs diff=lfs merge=lfs -text
-*.npz filter=lfs diff=lfs merge=lfs -text
-*.onnx filter=lfs diff=lfs merge=lfs -text
-*.ot filter=lfs diff=lfs merge=lfs -text
-*.parquet filter=lfs diff=lfs merge=lfs -text
-*.pb filter=lfs diff=lfs merge=lfs -text
-*.pickle filter=lfs diff=lfs merge=lfs -text
-*.pkl filter=lfs diff=lfs merge=lfs -text
-*.pt filter=lfs diff=lfs merge=lfs -text
-*.pth filter=lfs diff=lfs merge=lfs -text
-*.rar filter=lfs diff=lfs merge=lfs -text
-*.safetensors filter=lfs diff=lfs merge=lfs -text
-saved_model/**/* filter=lfs diff=lfs merge=lfs -text
-*.tar.* filter=lfs diff=lfs merge=lfs -text
-*.tar filter=lfs diff=lfs merge=lfs -text
-*.tflite filter=lfs diff=lfs merge=lfs -text
-*.tgz filter=lfs diff=lfs merge=lfs -text
-*.wasm filter=lfs diff=lfs merge=lfs -text
-*.xz filter=lfs diff=lfs merge=lfs -text
-*.zip filter=lfs diff=lfs merge=lfs -text
-*.zst filter=lfs diff=lfs merge=lfs -text
-*tfevents* filter=lfs diff=lfs merge=lfs -text

+*.7z filter=lfs diff=lfs merge=lfs -text
+*.arrow filter=lfs diff=lfs merge=lfs -text
+*.bin filter=lfs diff=lfs merge=lfs -text
+*.bz2 filter=lfs diff=lfs merge=lfs -text
+*.ckpt filter=lfs diff=lfs merge=lfs -text
+*.ftz filter=lfs diff=lfs merge=lfs -text
+*.gz filter=lfs diff=lfs merge=lfs -text
+*.h5 filter=lfs diff=lfs merge=lfs -text
+*.joblib filter=lfs diff=lfs merge=lfs -text
+*.lfs.* filter=lfs diff=lfs merge=lfs -text
+*.mlmodel filter=lfs diff=lfs merge=lfs -text
+*.model filter=lfs diff=lfs merge=lfs -text
+*.msgpack filter=lfs diff=lfs merge=lfs -text
+*.npy filter=lfs diff=lfs merge=lfs -text
+*.npz filter=lfs diff=lfs merge=lfs -text
+*.onnx filter=lfs diff=lfs merge=lfs -text
+*.ot filter=lfs diff=lfs merge=lfs -text
+*.parquet filter=lfs diff=lfs merge=lfs -text
+*.pb filter=lfs diff=lfs merge=lfs -text
+*.pickle filter=lfs diff=lfs merge=lfs -text
+*.pkl filter=lfs diff=lfs merge=lfs -text
+*.pt filter=lfs diff=lfs merge=lfs -text
+*.pth filter=lfs diff=lfs merge=lfs -text
+*.rar filter=lfs diff=lfs merge=lfs -text
+*.safetensors filter=lfs diff=lfs merge=lfs -text
+saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+*.tar.* filter=lfs diff=lfs merge=lfs -text
+*.tar filter=lfs diff=lfs merge=lfs -text
+*.tflite filter=lfs diff=lfs merge=lfs -text
+*.tgz filter=lfs diff=lfs merge=lfs -text
+*.wasm filter=lfs diff=lfs merge=lfs -text
+*.xz filter=lfs diff=lfs merge=lfs -text
+*.zip filter=lfs diff=lfs merge=lfs -text
+*.zst filter=lfs diff=lfs merge=lfs -text
+*tfevents* filter=lfs diff=lfs merge=lfs -text
+OminiControl/assets/cartoon_boy.png filter=lfs diff=lfs merge=lfs -text
+OminiControl/assets/clock.jpg filter=lfs diff=lfs merge=lfs -text
+OminiControl/assets/demo/demo_this_is_omini_control.jpg filter=lfs diff=lfs merge=lfs -text
+OminiControl/assets/demo/dreambooth_res.jpg filter=lfs diff=lfs merge=lfs -text
+OminiControl/assets/demo/monalisa_omini.jpg filter=lfs diff=lfs merge=lfs -text
+OminiControl/assets/demo/scene_variation.jpg filter=lfs diff=lfs merge=lfs -text
+OminiControl/assets/demo/try_on.jpg filter=lfs diff=lfs merge=lfs -text
+OminiControl/assets/monalisa.jpg filter=lfs diff=lfs merge=lfs -text
+OminiControl/assets/rc_car.jpg filter=lfs diff=lfs merge=lfs -text
+OminiControl/assets/room_corner.jpg filter=lfs diff=lfs merge=lfs -text
+OminiControl/assets/tshirt.jpg filter=lfs diff=lfs merge=lfs -text
+OminiControl/assets/vase_hq.jpg filter=lfs diff=lfs merge=lfs -text
+examples/breakingbad.jpg filter=lfs diff=lfs merge=lfs -text
+examples/DistractedBoyfriend.webp filter=lfs diff=lfs merge=lfs -text
+examples/doge.jpg filter=lfs diff=lfs merge=lfs -text
+examples/oiiai.png filter=lfs diff=lfs merge=lfs -text
+examples/PulpFiction.jpg filter=lfs diff=lfs merge=lfs -text
+examples/steve.webp filter=lfs diff=lfs merge=lfs -text
+wfControl/assets/cartoon_boy.png filter=lfs diff=lfs merge=lfs -text
+wfControl/assets/clock.jpg filter=lfs diff=lfs merge=lfs -text
+wfControl/assets/demo/demo_this_is_omini_control.jpg filter=lfs diff=lfs merge=lfs -text
+wfControl/assets/demo/dreambooth_res.jpg filter=lfs diff=lfs merge=lfs -text
+wfControl/assets/demo/monalisa_omini.jpg filter=lfs diff=lfs merge=lfs -text
+wfControl/assets/demo/scene_variation.jpg filter=lfs diff=lfs merge=lfs -text
+wfControl/assets/demo/try_on.jpg filter=lfs diff=lfs merge=lfs -text
+wfControl/assets/monalisa.jpg filter=lfs diff=lfs merge=lfs -text
+wfControl/assets/rc_car.jpg filter=lfs diff=lfs merge=lfs -text
+wfControl/assets/room_corner.jpg filter=lfs diff=lfs merge=lfs -text
+wfControl/assets/tshirt.jpg filter=lfs diff=lfs merge=lfs -text
+wfControl/assets/vase_hq.jpg filter=lfs diff=lfs merge=lfs -text

Dockerfile ADDED Viewed

	@@ -0,0 +1,21 @@

+# Image for the Space: Python 3.10 with diffusers v0.17.1 installed from source,
+# huggingface_hub pinned to 0.17.3, plus everything listed in requirements.txt.
+FROM python:3.10-slim
+WORKDIR /app
+COPY . .
+RUN apt-get update && apt-get install -y libgl1 libglib2.0-0 git
+# Remove any older copies (in case they were pre-injected into the environment)
+RUN pip uninstall -y diffusers huggingface_hub || true
+# Clone diffusers v0.17.1 from GitHub and install it
+RUN git clone --branch v0.17.1 https://github.com/huggingface/diffusers.git /tmp/diffusers && \
+    pip install /tmp/diffusers && \
+    rm -rf /tmp/diffusers
+RUN pip install huggingface_hub==0.17.3
+RUN pip install --no-cache-dir -r requirements.txt
+# 🔍 Diagnostic CMD: checks whether diffusers has actually been installed
+# NOTE(review): this command only prints the diffusers version and then exits;
+# it does not start the FastAPI app defined in app.py — confirm it is temporary.
+CMD ["python", "-c", "import diffusers; print('✅ Diffusers version:', diffusers.__version__)"]

README.md CHANGED Viewed

@@ -1,10 +1,13 @@
----
-title: Wayfu Amazing Art V2
-emoji: 🌖
-colorFrom: blue
-colorTo: blue
-sdk: docker
-pinned: false
----
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

+---
+title: Wayfu Art V2
+emoji: 🎨
+colorFrom: green
+colorTo: red
+sdk: docker
+sdk_version: 5.23.2
+app_file: app.py
+pinned: false
+license: unknown
+---
+Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

app.py ADDED Viewed

	@@ -0,0 +1,35 @@

+import io, base64, traceback
+from fastapi import FastAPI, Request
+from fastapi.responses import JSONResponse
+from PIL import Image
+try:
+    from wfcontrol import generate_image
+except Exception as e:
+    import traceback
+    print("🔥 Import failed:")
+    print(traceback.format_exc())
+    raise e
+app = FastAPI()
+@app.post("/transform")
+async def transform_image_api(request: Request):
+    try:
+        data = await request.json()
+        img_data = data["image"].split(",")[1]
+        prompt = data.get("prompt", "Studio Ghibli")
+        style = data.get("style", "ghibli")
+        img = Image.open(io.BytesIO(base64.b64decode(img_data))).convert("RGB")
+        result = generate_image(input_image=img, prompt=prompt, style=style)
+        buffer = io.BytesIO()
+        result.save(buffer, format="JPEG")
+        result_b64 = base64.b64encode(buffer.getvalue()).decode()
+        return JSONResponse(content={"image": "data:image/jpeg;base64," + result_b64})
+    except Exception as e:
+        print("🔥 ERROR:", traceback.format_exc())
+        return JSONResponse(status_code=500, content={"error": str(e)})

check_packages.py ADDED Viewed

	@@ -0,0 +1,4 @@

+import subprocess
+output = subprocess.getoutput("pip list")
+print("📦 Installed packages:")
+print(output)

log.py ADDED Viewed

	@@ -0,0 +1,78 @@

+import boto3
+import uuid
+import time
+import os
+from PIL import Image
+from io import BytesIO
+MAX_PIXELS = 2048
+AWS_BUCKET_NAME = os.environ.get("AWS_BUCKET_NAME", "")
+AWS_INFERENCE_LOG_TABLE = os.environ.get("AWS_INFERENCE_LOG_TABLE", "")
+AWS_FEEDBACK_LOG_TABLE = os.environ.get("AWS_FEEDBACK_LOG_TABLE", "")
+AWS_REGION = os.environ.get("AWS_REGION", "")
+AWS_ACCESS_ID = os.environ.get("AWS_ACCESS_ID", "")
+AWS_ACCESS_KEY = os.environ.get("AWS_ACCESS_KEY", "")
+aws_cfg = {
+    "aws_access_key_id": AWS_ACCESS_ID,
+    "aws_secret_access_key": AWS_ACCESS_KEY,
+    "region_name": AWS_REGION,
+}
+s3_client = boto3.client("s3", **aws_cfg)
+dynamodb = boto3.resource("dynamodb", **aws_cfg)
+inference_log = dynamodb.Table(AWS_INFERENCE_LOG_TABLE)
+feedback_log = dynamodb.Table(AWS_FEEDBACK_LOG_TABLE)
+def get_metadata():
+    return {
+        "_id": uuid.uuid4().hex,
+        "created_at": time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()),
+    }
+def insert_log(table_type: str, data: dict):
+    assert table_type in ["inference", "feedback"], "Invalid table type"
+    table = inference_log if table_type == "inference" else feedback_log
+    metadata = get_metadata()
+    response = table.put_item(
+        Item={
+            **data,
+            **metadata,
+        }
+    )
+    return response, metadata["_id"]
+# Example usage:
+# insert_log("inference", {"data": "test"})
+# insert_log("feedback", {"data": "test"})
+def get_image_obj(image: Image) -> BytesIO:
+    image.thumbnail((MAX_PIXELS, MAX_PIXELS))
+    image_obj = BytesIO()
+    image.save(image_obj, format="WEBP")
+    image_obj.seek(0)
+    return image_obj
+def log_image(image: Image) -> str:
+    metadata = get_metadata()
+    image_obj = get_image_obj(image)
+    s3_key = f"images/{metadata['_id']}.webp"
+    s3_client.upload_fileobj(image_obj, AWS_BUCKET_NAME, s3_key)
+    return metadata["_id"]
+# Example usage:
+# image = Image.open("examples/doge.jpg")
+# log_image(image)

requirements.txt ADDED Viewed

	@@ -0,0 +1,10 @@

+fastapi
+uvicorn
+Pillow
+numpy
+torch==2.0.1
+transformers==4.30.2
+peft
+opencv-python-headless
+scipy
+safetensors

wfControl/LICENSE ADDED Viewed

	@@ -0,0 +1,201 @@

+                                 Apache License
+                           Version 2.0, January 2004
+                        http://www.apache.org/licenses/
+   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+   1. Definitions.
+      "License" shall mean the terms and conditions for use, reproduction,
+      and distribution as defined by Sections 1 through 9 of this document.
+      "Licensor" shall mean the copyright owner or entity authorized by
+      the copyright owner that is granting the License.
+      "Legal Entity" shall mean the union of the acting entity and all
+      other entities that control, are controlled by, or are under common
+      control with that entity. For the purposes of this definition,
+      "control" means (i) the power, direct or indirect, to cause the
+      direction or management of such entity, whether by contract or
+      otherwise, or (ii) ownership of fifty percent (50%) or more of the
+      outstanding shares, or (iii) beneficial ownership of such entity.
+      "You" (or "Your") shall mean an individual or Legal Entity
+      exercising permissions granted by this License.
+      "Source" form shall mean the preferred form for making modifications,
+      including but not limited to software source code, documentation
+      source, and configuration files.
+      "Object" form shall mean any form resulting from mechanical
+      transformation or translation of a Source form, including but
+      not limited to compiled object code, generated documentation,
+      and conversions to other media types.
+      "Work" shall mean the work of authorship, whether in Source or
+      Object form, made available under the License, as indicated by a
+      copyright notice that is included in or attached to the work
+      (an example is provided in the Appendix below).
+      "Derivative Works" shall mean any work, whether in Source or Object
+      form, that is based on (or derived from) the Work and for which the
+      editorial revisions, annotations, elaborations, or other modifications
+      represent, as a whole, an original work of authorship. For the purposes
+      of this License, Derivative Works shall not include works that remain
+      separable from, or merely link (or bind by name) to the interfaces of,
+      the Work and Derivative Works thereof.
+      "Contribution" shall mean any work of authorship, including
+      the original version of the Work and any modifications or additions
+      to that Work or Derivative Works thereof, that is intentionally
+      submitted to Licensor for inclusion in the Work by the copyright owner
+      or by an individual or Legal Entity authorized to submit on behalf of
+      the copyright owner. For the purposes of this definition, "submitted"
+      means any form of electronic, verbal, or written communication sent
+      to the Licensor or its representatives, including but not limited to
+      communication on electronic mailing lists, source code control systems,
+      and issue tracking systems that are managed by, or on behalf of, the
+      Licensor for the purpose of discussing and improving the Work, but
+      excluding communication that is conspicuously marked or otherwise
+      designated in writing by the copyright owner as "Not a Contribution."
+      "Contributor" shall mean Licensor and any individual or Legal Entity
+      on behalf of whom a Contribution has been received by Licensor and
+      subsequently incorporated within the Work.
+   2. Grant of Copyright License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      copyright license to reproduce, prepare Derivative Works of,
+      publicly display, publicly perform, sublicense, and distribute the
+      Work and such Derivative Works in Source or Object form.
+   3. Grant of Patent License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      (except as stated in this section) patent license to make, have made,
+      use, offer to sell, sell, import, and otherwise transfer the Work,
+      where such license applies only to those patent claims licensable
+      by such Contributor that are necessarily infringed by their
+      Contribution(s) alone or by combination of their Contribution(s)
+      with the Work to which such Contribution(s) was submitted. If You
+      institute patent litigation against any entity (including a
+      cross-claim or counterclaim in a lawsuit) alleging that the Work
+      or a Contribution incorporated within the Work constitutes direct
+      or contributory patent infringement, then any patent licenses
+      granted to You under this License for that Work shall terminate
+      as of the date such litigation is filed.
+   4. Redistribution. You may reproduce and distribute copies of the
+      Work or Derivative Works thereof in any medium, with or without
+      modifications, and in Source or Object form, provided that You
+      meet the following conditions:
+      (a) You must give any other recipients of the Work or
+          Derivative Works a copy of this License; and
+      (b) You must cause any modified files to carry prominent notices
+          stating that You changed the files; and
+      (c) You must retain, in the Source form of any Derivative Works
+          that You distribute, all copyright, patent, trademark, and
+          attribution notices from the Source form of the Work,
+          excluding those notices that do not pertain to any part of
+          the Derivative Works; and
+      (d) If the Work includes a "NOTICE" text file as part of its
+          distribution, then any Derivative Works that You distribute must
+          include a readable copy of the attribution notices contained
+          within such NOTICE file, excluding those notices that do not
+          pertain to any part of the Derivative Works, in at least one
+          of the following places: within a NOTICE text file distributed
+          as part of the Derivative Works; within the Source form or
+          documentation, if provided along with the Derivative Works; or,
+          within a display generated by the Derivative Works, if and
+          wherever such third-party notices normally appear. The contents
+          of the NOTICE file are for informational purposes only and
+          do not modify the License. You may add Your own attribution
+          notices within Derivative Works that You distribute, alongside
+          or as an addendum to the NOTICE text from the Work, provided
+          that such additional attribution notices cannot be construed
+          as modifying the License.
+      You may add Your own copyright statement to Your modifications and
+      may provide additional or different license terms and conditions
+      for use, reproduction, or distribution of Your modifications, or
+      for any such Derivative Works as a whole, provided Your use,
+      reproduction, and distribution of the Work otherwise complies with
+      the conditions stated in this License.
+   5. Submission of Contributions. Unless You explicitly state otherwise,
+      any Contribution intentionally submitted for inclusion in the Work
+      by You to the Licensor shall be under the terms and conditions of
+      this License, without any additional terms or conditions.
+      Notwithstanding the above, nothing herein shall supersede or modify
+      the terms of any separate license agreement you may have executed
+      with Licensor regarding such Contributions.
+   6. Trademarks. This License does not grant permission to use the trade
+      names, trademarks, service marks, or product names of the Licensor,
+      except as required for reasonable and customary use in describing the
+      origin of the Work and reproducing the content of the NOTICE file.
+   7. Disclaimer of Warranty. Unless required by applicable law or
+      agreed to in writing, Licensor provides the Work (and each
+      Contributor provides its Contributions) on an "AS IS" BASIS,
+      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+      implied, including, without limitation, any warranties or conditions
+      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+      PARTICULAR PURPOSE. You are solely responsible for determining the
+      appropriateness of using or redistributing the Work and assume any
+      risks associated with Your exercise of permissions under this License.
+   8. Limitation of Liability. In no event and under no legal theory,
+      whether in tort (including negligence), contract, or otherwise,
+      unless required by applicable law (such as deliberate and grossly
+      negligent acts) or agreed to in writing, shall any Contributor be
+      liable to You for damages, including any direct, indirect, special,
+      incidental, or consequential damages of any character arising as a
+      result of this License or out of the use or inability to use the
+      Work (including but not limited to damages for loss of goodwill,
+      work stoppage, computer failure or malfunction, or any and all
+      other commercial damages or losses), even if such Contributor
+      has been advised of the possibility of such damages.
+   9. Accepting Warranty or Additional Liability. While redistributing
+      the Work or Derivative Works thereof, You may choose to offer,
+      and charge a fee for, acceptance of support, warranty, indemnity,
+      or other liability obligations and/or rights consistent with this
+      License. However, in accepting such obligations, You may act only
+      on Your own behalf and on Your sole responsibility, not on behalf
+      of any other Contributor, and only if You agree to indemnify,
+      defend, and hold each Contributor harmless for any liability
+      incurred by, or claims asserted against, such Contributor by reason
+      of your accepting any such warranty or additional liability.
+   END OF TERMS AND CONDITIONS
+   APPENDIX: How to apply the Apache License to your work.
+      To apply the Apache License to your work, attach the following
+      boilerplate notice, with the fields enclosed by brackets "[]"
+      replaced with your own identifying information. (Don't include
+      the brackets!)  The text should be enclosed in the appropriate
+      comment syntax for the file format. We also recommend that a
+      file or class name and description of purpose be included on the
+      same "printed page" as the copyright notice for easier
+      identification within third-party archives.
+   Copyright [2024] [Zhenxiong Tan]
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+       http://www.apache.org/licenses/LICENSE-2.0
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.

wfControl/README.md ADDED Viewed

	@@ -0,0 +1,170 @@

+# OminiControl
+<img src='./assets/demo/demo_this_is_omini_control.jpg' width='100%' />
+<br>
+<a href="https://arxiv.org/abs/2411.15098"><img src="https://img.shields.io/badge/arXiv-2411.15098-A42C25.svg" alt="arXiv"></a>
+<a href="https://huggingface.co/Yuanshi/OminiControl"><img src="https://img.shields.io/badge/🤗_HuggingFace-Model-ffbd45.svg" alt="HuggingFace"></a>
+<a href="https://huggingface.co/spaces/Yuanshi/OminiControl"><img src="https://img.shields.io/badge/🤗_HuggingFace-Space-ffbd45.svg" alt="HuggingFace"></a>
+<a href="https://github.com/Yuanshi9815/Subjects200K"><img src="https://img.shields.io/badge/GitHub-Dataset-blue.svg?logo=github&" alt="GitHub"></a>
+<a href="https://huggingface.co/datasets/Yuanshi/Subjects200K"><img src="https://img.shields.io/badge/🤗_HuggingFace-Dataset-ffbd45.svg" alt="HuggingFace"></a>
+> **OminiControl: Minimal and Universal Control for Diffusion Transformer**
+> <br>
+> Zhenxiong Tan,
+> [Songhua Liu](http://121.37.94.87/),
+> [Xingyi Yang](https://adamdad.github.io/),
+> Qiaochu Xue,
+> and
+> [Xinchao Wang](https://sites.google.com/site/sitexinchaowang/)
+> <br>
+> [Learning and Vision Lab](http://lv-nus.org/), National University of Singapore
+> <br>
+## Features
+OminiControl is a minimal yet powerful universal control framework for Diffusion Transformer models like [FLUX](https://github.com/black-forest-labs/flux).
+* **Universal Control 🌐**:  A unified control framework that supports both subject-driven control and spatial control (such as edge-guided and in-painting generation).
+* **Minimal Design 🚀**: Injects control signals while preserving original model structure. Only introduces 0.1% additional parameters to the base model.
+## News
+- **2024-12-26**: ⭐️ Training code is released. Now you can create your own OminiControl model by customizing any control task (3D, multi-view, pose-guided, try-on, etc.) with the FLUX model. Check the [training folder](./train) for more details.
+## Quick Start
+### Setup (Optional)
+1. **Environment setup**
+```bash
+conda create -n omini python=3.10
+conda activate omini
+```
+2. **Requirements installation**
+```bash
+pip install -r requirements.txt
+```
+### Usage example
+1. Subject-driven generation: `examples/subject.ipynb`
+2. In-painting: `examples/inpainting.ipynb`
+3. Canny edge to image, depth to image, colorization, deblurring: `examples/spatial.ipynb`
+### Gradio app
+To run the Gradio app for subject-driven generation:
+```bash
+python -m src.gradio.gradio_app
+```
+### Guidelines for subject-driven generation
+1. Input images are automatically center-cropped and resized to 512x512 resolution.
+2. When writing prompts, refer to the subject using phrases like `this item`, `the object`, or `it`. e.g.
+   1. *A close up view of this item. It is placed on a wooden table.*
+   2. *A young lady is wearing this shirt.*
+3. The model primarily works with objects rather than human subjects currently, due to the absence of human data in training.
+## Generated samples
+### Subject-driven generation
+<a href="https://huggingface.co/spaces/Yuanshi/OminiControl"><img src="https://img.shields.io/badge/🤗_HuggingFace-Space-ffbd45.svg" alt="HuggingFace"></a>
+**Demos** (Left: condition image; Right: generated image)
+<div float="left">
+  <img src='./assets/demo/oranges_omini.jpg' width='48%'/>
+  <img src='./assets/demo/rc_car_omini.jpg' width='48%' />
+  <img src='./assets/demo/clock_omini.jpg' width='48%' />
+  <img src='./assets/demo/shirt_omini.jpg' width='48%' />
+</div>
+<details>
+<summary>Text Prompts</summary>
+- Prompt1: *A close up view of this item. It is placed on a wooden table. The background is a dark room, the TV is on, and the screen is showing a cooking show. With text on the screen that reads 'Omini Control!.'*
+- Prompt2: *A film style shot. On the moon, this item drives across the moon surface. A flag on it reads 'Omini'. The background is that Earth looms large in the foreground.*
+- Prompt3: *In a Bauhaus style room, this item is placed on a shiny glass table, with a vase of flowers next to it. In the afternoon sun, the shadows of the blinds are cast on the wall.*
+- Prompt4: *"On the beach, a lady sits under a beach umbrella with 'Omini' written on it. She's wearing this shirt and has a big smile on her face, with her surfboard behind her. The sun is setting in the background. The sky is a beautiful shade of orange and purple."*
+</details>
+<details>
+<summary>More results</summary>
+* Try on:
+  <img src='./assets/demo/try_on.jpg'/>
+* Scene variations:
+  <img src='./assets/demo/scene_variation.jpg'/>
+* Dreambooth dataset:
+  <img src='./assets/demo/dreambooth_res.jpg'/>
+* Oye-cartoon finetune:
+  <div float="left">
+    <img src='./assets/demo/man_omini.jpg' width='48%' />
+    <img src='./assets/demo/panda_omini.jpg' width='48%' />
+  </div>
+</details>
+### Spatially aligned control
+1. **Image Inpainting** (Left: original image; Center: masked image; Right: filled image)
+  - Prompt: *The Mona Lisa is wearing a white VR headset with 'Omini' written on it.*
+    </br>
+    <img src='./assets/demo/monalisa_omini.jpg' width='700px' />
+  - Prompt: *A yellow book with the word 'OMINI' in large font on the cover. The text 'for FLUX' appears at the bottom.*
+    </br>
+    <img src='./assets/demo/book_omini.jpg' width='700px' />
+2. **Other spatially aligned tasks**  (Canny edge to image, depth to image, colorization, deblurring)
+    </br>
+    <details>
+    <summary>Click to show</summary>
+    <div float="left">
+      <img src='./assets/demo/room_corner_canny.jpg' width='48%'/>
+      <img src='./assets/demo/room_corner_depth.jpg' width='48%' />
+      <img src='./assets/demo/room_corner_coloring.jpg' width='48%' />
+      <img src='./assets/demo/room_corner_deblurring.jpg' width='48%' />
+    </div>
+    Prompt: *A light gray sofa stands against a white wall, featuring a black and white geometric patterned pillow. A white side table sits next to the sofa, topped with a white adjustable desk lamp and some books. Dark hardwood flooring contrasts with the pale walls and furniture.*
+    </details>
+## Models
+**Subject-driven control:**
+| Model                                                                                            | Base model     | Description                                                                                              | Resolution   |
+| ------------------------------------------------------------------------------------------------ | -------------- | -------------------------------------------------------------------------------------------------------- | ------------ |
+| [`experimental`](https://huggingface.co/Yuanshi/OminiControl/tree/main/experimental) / `subject` | FLUX.1-schnell | The model used in the paper.                                                                             | (512, 512)   |
+| [`omini`](https://huggingface.co/Yuanshi/OminiControl/tree/main/omini) / `subject_512`           | FLUX.1-schnell | The model has been fine-tuned on a larger dataset.                                                       | (512, 512)   |
+| [`omini`](https://huggingface.co/Yuanshi/OminiControl/tree/main/omini) / `subject_1024`          | FLUX.1-schnell | The model has been fine-tuned on a larger dataset and accommodates higher resolution.   (To be released) | (1024, 1024) |
+| [`oye-cartoon`](https://huggingface.co/saquiboye/oye-cartoon)          | FLUX.1-dev | The model has been fine-tuned on [oye-cartoon](https://huggingface.co/datasets/saquiboye/oye-cartoon) dataset by [@saquib764](https://github.com/Saquib764) | (512, 512) |
+**Spatially aligned control:**
+| Model                                                                                                     | Base model | Description                                                                | Resolution   |
+| --------------------------------------------------------------------------------------------------------- | ---------- | -------------------------------------------------------------------------- | ------------ |
+| [`experimental`](https://huggingface.co/Yuanshi/OminiControl/tree/main/experimental) / `<task_name>`      | FLUX.1     | Canny edge to image, depth to image, colorization, deblurring, in-painting | (512, 512)   |
+| [`experimental`](https://huggingface.co/Yuanshi/OminiControl/tree/main/experimental) / `<task_name>_1024` | FLUX.1     | Supports higher resolution. (To be released)                               | (1024, 1024) |
+## Community Extensions
+- [ComfyUI-Diffusers-OminiControl](https://github.com/Macoron/ComfyUI-Diffusers-OminiControl) - ComfyUI integration by [@Macoron](https://github.com/Macoron)
+- [ComfyUI_RH_OminiControl](https://github.com/HM-RunningHub/ComfyUI_RH_OminiControl) - ComfyUI integration by [@HM-RunningHub](https://github.com/HM-RunningHub)
+## Limitations
+1. The model's subject-driven generation primarily works with objects rather than human subjects due to the absence of human data in training.
+2. The subject-driven generation model may not work well with `FLUX.1-dev`.
+3. The released model currently only supports the resolution of 512x512.
+## Training
+Training instructions can be found in this [folder](./train).
+## To-do
+- [x] Release the training code.
+- [ ] Release the model for higher resolution (1024x1024).
+## Citation
+```
+@article{tan2024ominicontrol,
+  title={Ominicontrol: Minimal and universal control for diffusion transformer},
+  author={Tan, Zhenxiong and Liu, Songhua and Yang, Xingyi and Xue, Qiaochu and Wang, Xinchao},
+  journal={arXiv preprint arXiv:2411.15098},
+  volume={3},
+  year={2024}
+}
+```

wfControl/assets/book.jpg ADDED Viewed

wfControl/assets/cartoon_boy.png ADDED Viewed

Git LFS Details

SHA256: d4a82c0f9ed09b9468bded7d901beffaf29addc30ed5f72ad72451e1b6344b1c
Pointer size: 131 Bytes
Size of remote file: 429 kB

wfControl/assets/clock.jpg ADDED Viewed

Git LFS Details

SHA256: 41235973f26152ac92d32bfc166fb5f9f1e352c5e16807920238473316ec462b
Pointer size: 131 Bytes
Size of remote file: 289 kB

wfControl/assets/coffee.png ADDED Viewed

wfControl/assets/demo/book_omini.jpg ADDED Viewed

wfControl/assets/demo/clock_omini.jpg ADDED Viewed

wfControl/assets/demo/demo_this_is_omini_control.jpg ADDED Viewed

Git LFS Details

SHA256: 798b7c25be6be118dc0de97c444c840869afca633a0d48f99d940aec040a7518
Pointer size: 131 Bytes
Size of remote file: 129 kB

wfControl/assets/demo/dreambooth_res.jpg ADDED Viewed

Git LFS Details

SHA256: ba36bd861989564dc679acf3b5e56f382f1a11b1596e6f611ea0bd7d81b89680
Pointer size: 132 Bytes
Size of remote file: 1.94 MB

wfControl/assets/demo/man_omini.jpg ADDED Viewed

wfControl/assets/demo/monalisa_omini.jpg ADDED Viewed

Git LFS Details

SHA256: e5ca6c2bf44f19d216b2eb16dcc67d19f11d87220d3ee80f5e5e1ad98a5536dc
Pointer size: 131 Bytes
Size of remote file: 133 kB

wfControl/assets/demo/oranges_omini.jpg ADDED Viewed

wfControl/assets/demo/panda_omini.jpg ADDED Viewed

wfControl/assets/demo/penguin_omini.jpg ADDED Viewed

wfControl/assets/demo/rc_car_omini.jpg ADDED Viewed

wfControl/assets/demo/room_corner_canny.jpg ADDED Viewed

wfControl/assets/demo/room_corner_coloring.jpg ADDED Viewed

wfControl/assets/demo/room_corner_deblurring.jpg ADDED Viewed

wfControl/assets/demo/room_corner_depth.jpg ADDED Viewed

wfControl/assets/demo/scene_variation.jpg ADDED Viewed

Git LFS Details

SHA256: 39e4e16d2eeb58b3775b6d34c8b3e125d0d19cc36fa90b07c6c8d57624ad4333
Pointer size: 131 Bytes
Size of remote file: 958 kB

wfControl/assets/demo/shirt_omini.jpg ADDED Viewed

wfControl/assets/demo/try_on.jpg ADDED Viewed

Git LFS Details

SHA256: 6adce5194329a83f0109b4375e00667c341879e64fb55831c70ea3f3b2f99f7e
Pointer size: 131 Bytes
Size of remote file: 774 kB

wfControl/assets/monalisa.jpg ADDED Viewed

Git LFS Details

SHA256: 188b8b6499e4541f9dfef2a9daf6f1eb920079c9208f587fd97566d6aa4a9719
Pointer size: 131 Bytes
Size of remote file: 353 kB

wfControl/assets/oranges.jpg ADDED Viewed

wfControl/assets/penguin.jpg ADDED Viewed

wfControl/assets/rc_car.jpg ADDED Viewed

Git LFS Details

SHA256: ae8aed11029fa3b084deb286c07a8cab5056840c9c123816fe2b504e94233e95
Pointer size: 131 Bytes
Size of remote file: 254 kB

wfControl/assets/room_corner.jpg ADDED Viewed

Git LFS Details

SHA256: f97bd63df05f5f15ad5dd1a2ccef803e74e12caadd8fe145493fd6d5219045e7
Pointer size: 131 Bytes
Size of remote file: 236 kB

wfControl/assets/test_in.jpg ADDED Viewed

wfControl/assets/test_out.jpg ADDED Viewed

wfControl/assets/tshirt.jpg ADDED Viewed

Git LFS Details

SHA256: cb1803315765302113a9e7a64dedd4ecba2672028cf093cbc33ef2edd2247c39
Pointer size: 131 Bytes
Size of remote file: 301 kB

wfControl/assets/vase.jpg ADDED Viewed

wfControl/assets/vase_hq.jpg ADDED Viewed

Git LFS Details

SHA256: 279905e32116792f118802d23b0d96629d98ccbdac9e704e65eaf2e98c752679
Pointer size: 132 Bytes
Size of remote file: 2.9 MB

wfControl/examples/inpainting.ipynb ADDED Viewed

	@@ -0,0 +1,143 @@

+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import os\n",
+    "\n",
+    "os.chdir(\"..\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import torch\n",
+    "from diffusers.pipelines import FluxPipeline\n",
+    "from src.flux.condition import Condition\n",
+    "from PIL import Image\n",
+    "\n",
+    "from src.flux.generate import generate, seed_everything"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "pipe = FluxPipeline.from_pretrained(\n",
+    "    \"black-forest-labs/FLUX.1-dev\", torch_dtype=torch.bfloat16\n",
+    ")\n",
+    "pipe = pipe.to(\"cuda\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "pipe.load_lora_weights(\n",
+    "    \"Yuanshi/OminiControl\",\n",
+    "    weight_name=f\"experimental/fill.safetensors\",\n",
+    "    adapter_name=\"fill\",\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "image = Image.open(\"assets/monalisa.jpg\").convert(\"RGB\").resize((512, 512))\n",
+    "\n",
+    "masked_image = image.copy()\n",
+    "masked_image.paste((0, 0, 0), (128, 100, 384, 220))\n",
+    "\n",
+    "condition = Condition(\"fill\", masked_image)\n",
+    "\n",
+    "seed_everything()\n",
+    "result_img = generate(\n",
+    "    pipe,\n",
+    "    prompt=\"The Mona Lisa is wearing a white VR headset with 'Omini' written on it.\",\n",
+    "    conditions=[condition],\n",
+    ").images[0]\n",
+    "\n",
+    "concat_image = Image.new(\"RGB\", (1536, 512))\n",
+    "concat_image.paste(image, (0, 0))\n",
+    "concat_image.paste(condition.condition, (512, 0))\n",
+    "concat_image.paste(result_img, (1024, 0))\n",
+    "concat_image"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "image = Image.open(\"assets/book.jpg\").convert(\"RGB\").resize((512, 512))\n",
+    "\n",
+    "w, h, min_dim = image.size + (min(image.size),)\n",
+    "image = image.crop(\n",
+    "    ((w - min_dim) // 2, (h - min_dim) // 2, (w + min_dim) // 2, (h + min_dim) // 2)\n",
+    ").resize((512, 512))\n",
+    "\n",
+    "\n",
+    "masked_image = image.copy()\n",
+    "masked_image.paste((0, 0, 0), (150, 150, 350, 250))\n",
+    "masked_image.paste((0, 0, 0), (200, 380, 320, 420))\n",
+    "\n",
+    "condition = Condition(\"fill\", masked_image)\n",
+    "\n",
+    "seed_everything()\n",
+    "result_img = generate(\n",
+    "    pipe,\n",
+    "    prompt=\"A yellow book with the word 'OMINI' in large font on the cover. The text 'for FLUX' appears at the bottom.\",\n",
+    "    conditions=[condition],\n",
+    ").images[0]\n",
+    "\n",
+    "concat_image = Image.new(\"RGB\", (1536, 512))\n",
+    "concat_image.paste(image, (0, 0))\n",
+    "concat_image.paste(condition.condition, (512, 0))\n",
+    "concat_image.paste(result_img, (1024, 0))\n",
+    "concat_image"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "base",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.12.7"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}

wfControl/examples/spatial.ipynb ADDED Viewed

	@@ -0,0 +1,184 @@

+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import os\n",
+    "\n",
+    "os.chdir(\"..\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import torch\n",
+    "from diffusers.pipelines import FluxPipeline\n",
+    "from src.flux.condition import Condition\n",
+    "from PIL import Image\n",
+    "\n",
+    "from src.flux.generate import generate, seed_everything"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "pipe = FluxPipeline.from_pretrained(\n",
+    "    \"black-forest-labs/FLUX.1-dev\", torch_dtype=torch.bfloat16\n",
+    ")\n",
+    "pipe = pipe.to(\"cuda\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "for condition_type in [\"canny\", \"depth\", \"coloring\", \"deblurring\"]:\n",
+    "    pipe.load_lora_weights(\n",
+    "        \"Yuanshi/OminiControl\",\n",
+    "        weight_name=f\"experimental/{condition_type}.safetensors\",\n",
+    "        adapter_name=condition_type,\n",
+    "    )"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "image = Image.open(\"assets/coffee.png\").convert(\"RGB\")\n",
+    "\n",
+    "w, h, min_dim = image.size + (min(image.size),)\n",
+    "image = image.crop(\n",
+    "    ((w - min_dim) // 2, (h - min_dim) // 2, (w + min_dim) // 2, (h + min_dim) // 2)\n",
+    ").resize((512, 512))\n",
+    "\n",
+    "prompt = \"In a bright room. A cup of a coffee with some beans on the side. They are placed on a dark wooden table.\""
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "condition = Condition(\"canny\", image)\n",
+    "\n",
+    "seed_everything()\n",
+    "\n",
+    "result_img = generate(\n",
+    "    pipe,\n",
+    "    prompt=prompt,\n",
+    "    conditions=[condition],\n",
+    ").images[0]\n",
+    "\n",
+    "concat_image = Image.new(\"RGB\", (1536, 512))\n",
+    "concat_image.paste(image, (0, 0))\n",
+    "concat_image.paste(condition.condition, (512, 0))\n",
+    "concat_image.paste(result_img, (1024, 0))\n",
+    "concat_image"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "condition = Condition(\"depth\", image)\n",
+    "\n",
+    "seed_everything()\n",
+    "\n",
+    "result_img = generate(\n",
+    "    pipe,\n",
+    "    prompt=prompt,\n",
+    "    conditions=[condition],\n",
+    ").images[0]\n",
+    "\n",
+    "concat_image = Image.new(\"RGB\", (1536, 512))\n",
+    "concat_image.paste(image, (0, 0))\n",
+    "concat_image.paste(condition.condition, (512, 0))\n",
+    "concat_image.paste(result_img, (1024, 0))\n",
+    "concat_image"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "condition = Condition(\"deblurring\", image)\n",
+    "\n",
+    "seed_everything()\n",
+    "\n",
+    "result_img = generate(\n",
+    "    pipe,\n",
+    "    prompt=prompt,\n",
+    "    conditions=[condition],\n",
+    ").images[0]\n",
+    "\n",
+    "concat_image = Image.new(\"RGB\", (1536, 512))\n",
+    "concat_image.paste(image, (0, 0))\n",
+    "concat_image.paste(condition.condition, (512, 0))\n",
+    "concat_image.paste(result_img, (1024, 0))\n",
+    "concat_image"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "condition = Condition(\"coloring\", image)\n",
+    "\n",
+    "seed_everything()\n",
+    "\n",
+    "result_img = generate(\n",
+    "    pipe,\n",
+    "    prompt=prompt,\n",
+    "    conditions=[condition],\n",
+    ").images[0]\n",
+    "\n",
+    "concat_image = Image.new(\"RGB\", (1536, 512))\n",
+    "concat_image.paste(image, (0, 0))\n",
+    "concat_image.paste(condition.condition, (512, 0))\n",
+    "concat_image.paste(result_img, (1024, 0))\n",
+    "concat_image"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "base",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.12.7"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}

wfControl/examples/subject.ipynb ADDED Viewed

	@@ -0,0 +1,214 @@

+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import os\n",
+    "\n",
+    "os.chdir(\"..\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import torch\n",
+    "from diffusers.pipelines import FluxPipeline\n",
+    "from src.flux.condition import Condition\n",
+    "from PIL import Image\n",
+    "\n",
+    "from src.flux.generate import generate, seed_everything"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "pipe = FluxPipeline.from_pretrained(\n",
+    "    \"black-forest-labs/FLUX.1-schnell\", torch_dtype=torch.bfloat16\n",
+    ")\n",
+    "pipe = pipe.to(\"cuda\")\n",
+    "pipe.load_lora_weights(\n",
+    "    \"Yuanshi/OminiControl\",\n",
+    "    weight_name=f\"omini/subject_512.safetensors\",\n",
+    "    adapter_name=\"subject\",\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "image = Image.open(\"assets/penguin.jpg\").convert(\"RGB\").resize((512, 512))\n",
+    "\n",
+    "condition = Condition(\"subject\", image, position_delta=(0, 32))\n",
+    "\n",
+    "prompt = \"On Christmas evening, on a crowded sidewalk, this item sits on the road, covered in snow and wearing a Christmas hat.\"\n",
+    "\n",
+    "\n",
+    "seed_everything(0)\n",
+    "\n",
+    "result_img = generate(\n",
+    "    pipe,\n",
+    "    prompt=prompt,\n",
+    "    conditions=[condition],\n",
+    "    num_inference_steps=8,\n",
+    "    height=512,\n",
+    "    width=512,\n",
+    ").images[0]\n",
+    "\n",
+    "concat_image = Image.new(\"RGB\", (1024, 512))\n",
+    "concat_image.paste(image, (0, 0))\n",
+    "concat_image.paste(result_img, (512, 0))\n",
+    "concat_image"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "image = Image.open(\"assets/tshirt.jpg\").convert(\"RGB\").resize((512, 512))\n",
+    "\n",
+    "condition = Condition(\"subject\", image, position_delta=(0, 32))\n",
+    "\n",
+    "prompt = \"On the beach, a lady sits under a beach umbrella. She's wearing this shirt and has a big smile on her face, with her surfboard hehind her. The sun is setting in the background. The sky is a beautiful shade of orange and purple.\"\n",
+    "\n",
+    "\n",
+    "seed_everything()\n",
+    "\n",
+    "result_img = generate(\n",
+    "    pipe,\n",
+    "    prompt=prompt,\n",
+    "    conditions=[condition],\n",
+    "    num_inference_steps=8,\n",
+    "    height=512,\n",
+    "    width=512,\n",
+    ").images[0]\n",
+    "\n",
+    "concat_image = Image.new(\"RGB\", (1024, 512))\n",
+    "concat_image.paste(condition.condition, (0, 0))\n",
+    "concat_image.paste(result_img, (512, 0))\n",
+    "concat_image"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "image = Image.open(\"assets/rc_car.jpg\").convert(\"RGB\").resize((512, 512))\n",
+    "\n",
+    "condition = Condition(\"subject\", image, position_delta=(0, 32))\n",
+    "\n",
+    "prompt = \"A film style shot. On the moon, this item drives across the moon surface. The background is that Earth looms large in the foreground.\"\n",
+    "\n",
+    "seed_everything()\n",
+    "\n",
+    "result_img = generate(\n",
+    "    pipe,\n",
+    "    prompt=prompt,\n",
+    "    conditions=[condition],\n",
+    "    num_inference_steps=8,\n",
+    "    height=512,\n",
+    "    width=512,\n",
+    ").images[0]\n",
+    "\n",
+    "concat_image = Image.new(\"RGB\", (1024, 512))\n",
+    "concat_image.paste(condition.condition, (0, 0))\n",
+    "concat_image.paste(result_img, (512, 0))\n",
+    "concat_image"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "image = Image.open(\"assets/clock.jpg\").convert(\"RGB\").resize((512, 512))\n",
+    "\n",
+    "condition = Condition(\"subject\", image, position_delta=(0, 32))\n",
+    "\n",
+    "prompt = \"In a Bauhaus style room, this item is placed on a shiny glass table, with a vase of flowers next to it. In the afternoon sun, the shadows of the blinds are cast on the wall.\"\n",
+    "\n",
+    "seed_everything()\n",
+    "\n",
+    "result_img = generate(\n",
+    "    pipe,\n",
+    "    prompt=prompt,\n",
+    "    conditions=[condition],\n",
+    "    num_inference_steps=8,\n",
+    "    height=512,\n",
+    "    width=512,\n",
+    ").images[0]\n",
+    "\n",
+    "concat_image = Image.new(\"RGB\", (1024, 512))\n",
+    "concat_image.paste(condition.condition, (0, 0))\n",
+    "concat_image.paste(result_img, (512, 0))\n",
+    "concat_image"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "image = Image.open(\"assets/oranges.jpg\").convert(\"RGB\").resize((512, 512))\n",
+    "\n",
+    "condition = Condition(\"subject\", image, position_delta=(0, 32))\n",
+    "\n",
+    "prompt = \"A very close up view of this item. It is placed on a wooden table. The background is a dark room, the TV is on, and the screen is showing a cooking show.\"\n",
+    "\n",
+    "seed_everything()\n",
+    "\n",
+    "result_img = generate(\n",
+    "    pipe,\n",
+    "    prompt=prompt,\n",
+    "    conditions=[condition],\n",
+    "    num_inference_steps=8,\n",
+    "    height=512,\n",
+    "    width=512,\n",
+    ").images[0]\n",
+    "\n",
+    "concat_image = Image.new(\"RGB\", (1024, 512))\n",
+    "concat_image.paste(condition.condition, (0, 0))\n",
+    "concat_image.paste(result_img, (512, 0))\n",
+    "concat_image"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "base",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.12.7"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}

wfControl/examples/subject_1024.ipynb ADDED Viewed

	@@ -0,0 +1,221 @@

+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import os\n",
+    "\n",
+    "os.chdir(\"..\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import torch\n",
+    "from diffusers.pipelines import FluxPipeline\n",
+    "from src.flux.condition import Condition\n",
+    "from PIL import Image\n",
+    "\n",
+    "from src.flux.generate import generate, seed_everything"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "pipe = FluxPipeline.from_pretrained(\n",
+    "    \"black-forest-labs/FLUX.1-schnell\", torch_dtype=torch.bfloat16\n",
+    ")\n",
+    "pipe = pipe.to(\"cuda\")\n",
+    "pipe.load_lora_weights(\n",
+    "    \"Yuanshi/OminiControl\",\n",
+    "    weight_name=f\"omini/subject_1024_beta.safetensors\",\n",
+    "    adapter_name=\"subject\",\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "image = Image.open(\"assets/penguin.jpg\").convert(\"RGB\").resize((512, 512))\n",
+    "\n",
+    "condition = Condition(\"subject\", image)\n",
+    "\n",
+    "prompt = \"On Christmas evening, on a crowded sidewalk, this item sits on the road, covered in snow and wearing a Christmas hat.\"\n",
+    "\n",
+    "\n",
+    "seed_everything(0)\n",
+    "\n",
+    "result_img = generate(\n",
+    "    pipe,\n",
+    "    prompt=prompt,\n",
+    "    conditions=[condition],\n",
+    "    num_inference_steps=8,\n",
+    "    height=1024,\n",
+    "    width=1024,\n",
+    ").images[0]\n",
+    "\n",
+    "concat_image = Image.new(\"RGB\", (1024+512, 1024))\n",
+    "concat_image.paste(image, (0, 0))\n",
+    "concat_image.paste(result_img, (512, 0))\n",
+    "concat_image"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "image = Image.open(\"assets/tshirt.jpg\").convert(\"RGB\").resize((512, 512))\n",
+    "\n",
+    "condition = Condition(\"subject\", image)\n",
+    "\n",
+    "prompt = \"On the beach, a lady sits under a beach umbrella. She's wearing this shirt and has a big smile on her face, with her surfboard hehind her. The sun is setting in the background. The sky is a beautiful shade of orange and purple.\"\n",
+    "\n",
+    "\n",
+    "seed_everything(0)\n",
+    "\n",
+    "result_img = generate(\n",
+    "    pipe,\n",
+    "    prompt=prompt,\n",
+    "    conditions=[condition],\n",
+    "    num_inference_steps=8,\n",
+    "    height=1024,\n",
+    "    width=1024,\n",
+    ").images[0]\n",
+    "\n",
+    "concat_image = Image.new(\"RGB\", (1024+512, 1024))\n",
+    "concat_image.paste(image, (0, 0))\n",
+    "concat_image.paste(result_img, (512, 0))\n",
+    "concat_image"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "image = Image.open(\"assets/rc_car.jpg\").convert(\"RGB\").resize((512, 512))\n",
+    "\n",
+    "condition = Condition(\"subject\", image)\n",
+    "\n",
+    "prompt = \"A film style shot. On the moon, this item drives across the moon surface. The background is that Earth looms large in the foreground.\"\n",
+    "\n",
+    "seed_everything()\n",
+    "\n",
+    "result_img = generate(\n",
+    "    pipe,\n",
+    "    prompt=prompt,\n",
+    "    conditions=[condition],\n",
+    "    num_inference_steps=8,\n",
+    "    height=1024,\n",
+    "    width=1024,\n",
+    ").images[0]\n",
+    "\n",
+    "concat_image = Image.new(\"RGB\", (1024+512, 1024))\n",
+    "concat_image.paste(image, (0, 0))\n",
+    "concat_image.paste(result_img, (512, 0))\n",
+    "concat_image"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "image = Image.open(\"assets/clock.jpg\").convert(\"RGB\").resize((512, 512))\n",
+    "\n",
+    "condition = Condition(\"subject\", image)\n",
+    "\n",
+    "prompt = \"In a Bauhaus style room, this item is placed on a shiny glass table, with a vase of flowers next to it. In the afternoon sun, the shadows of the blinds are cast on the wall.\"\n",
+    "\n",
+    "seed_everything(0)\n",
+    "\n",
+    "result_img = generate(\n",
+    "    pipe,\n",
+    "    prompt=prompt,\n",
+    "    conditions=[condition],\n",
+    "    num_inference_steps=8,\n",
+    "    height=1024,\n",
+    "    width=1024,\n",
+    ").images[0]\n",
+    "\n",
+    "concat_image = Image.new(\"RGB\", (1024+512, 1024))\n",
+    "concat_image.paste(image, (0, 0))\n",
+    "concat_image.paste(result_img, (512, 0))\n",
+    "concat_image"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "image = Image.open(\"assets/oranges.jpg\").convert(\"RGB\").resize((512, 512))\n",
+    "\n",
+    "condition = Condition(\"subject\", image)\n",
+    "\n",
+    "prompt = \"A very close up view of this item. It is placed on a wooden table. The background is a dark room, the TV is on, and the screen is showing a cooking show.\"\n",
+    "\n",
+    "seed_everything()\n",
+    "\n",
+    "result_img = generate(\n",
+    "    pipe,\n",
+    "    prompt=prompt,\n",
+    "    conditions=[condition],\n",
+    "    num_inference_steps=8,\n",
+    "    height=1024,\n",
+    "    width=1024,\n",
+    ").images[0]\n",
+    "\n",
+    "concat_image = Image.new(\"RGB\", (1024+512, 1024))\n",
+    "concat_image.paste(image, (0, 0))\n",
+    "concat_image.paste(result_img, (512, 0))\n",
+    "concat_image"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.9.21"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}

wfControl/requirements.txt ADDED Viewed

	@@ -0,0 +1,7 @@

+transformers==4.30.2
+peft
+opencv-python-headless  # ✅ replaces opencv-python
+protobuf
+sentencepiece
+gradio  # (only needed if you test locally with the UI)

wfControl/src/flux/block.py ADDED Viewed

	@@ -0,0 +1,339 @@

+import torch
+from typing import List, Union, Optional, Dict, Any, Callable
+from diffusers.models.attention_processor import Attention, F
+from .lora_controller import enable_lora
+def attn_forward(
+    attn: Attention,
+    hidden_states: torch.FloatTensor,
+    encoder_hidden_states: torch.FloatTensor = None,
+    condition_latents: torch.FloatTensor = None,
+    attention_mask: Optional[torch.FloatTensor] = None,
+    image_rotary_emb: Optional[torch.Tensor] = None,
+    cond_rotary_emb: Optional[torch.Tensor] = None,
+    model_config: Optional[Dict[str, Any]] = {},
+) -> torch.FloatTensor:
+    batch_size, _, _ = (
+        hidden_states.shape
+        if encoder_hidden_states is None
+        else encoder_hidden_states.shape
+    )
+    with enable_lora(
+        (attn.to_q, attn.to_k, attn.to_v), model_config.get("latent_lora", False)
+    ):
+        # `sample` projections.
+        query = attn.to_q(hidden_states)
+        key = attn.to_k(hidden_states)
+        value = attn.to_v(hidden_states)
+    inner_dim = key.shape[-1]
+    head_dim = inner_dim // attn.heads
+    query = query.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
+    key = key.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
+    value = value.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
+    if attn.norm_q is not None:
+        query = attn.norm_q(query)
+    if attn.norm_k is not None:
+        key = attn.norm_k(key)
+    # the attention in FluxSingleTransformerBlock does not use `encoder_hidden_states`
+    if encoder_hidden_states is not None:
+        # `context` projections.
+        encoder_hidden_states_query_proj = attn.add_q_proj(encoder_hidden_states)
+        encoder_hidden_states_key_proj = attn.add_k_proj(encoder_hidden_states)
+        encoder_hidden_states_value_proj = attn.add_v_proj(encoder_hidden_states)
+        encoder_hidden_states_query_proj = encoder_hidden_states_query_proj.view(
+            batch_size, -1, attn.heads, head_dim
+        ).transpose(1, 2)
+        encoder_hidden_states_key_proj = encoder_hidden_states_key_proj.view(
+            batch_size, -1, attn.heads, head_dim
+        ).transpose(1, 2)
+        encoder_hidden_states_value_proj = encoder_hidden_states_value_proj.view(
+            batch_size, -1, attn.heads, head_dim
+        ).transpose(1, 2)
+        if attn.norm_added_q is not None:
+            encoder_hidden_states_query_proj = attn.norm_added_q(
+                encoder_hidden_states_query_proj
+            )
+        if attn.norm_added_k is not None:
+            encoder_hidden_states_key_proj = attn.norm_added_k(
+                encoder_hidden_states_key_proj
+            )
+        # attention
+        query = torch.cat([encoder_hidden_states_query_proj, query], dim=2)
+        key = torch.cat([encoder_hidden_states_key_proj, key], dim=2)
+        value = torch.cat([encoder_hidden_states_value_proj, value], dim=2)
+    if image_rotary_emb is not None:
+        from diffusers.models.embeddings import apply_rotary_emb
+        query = apply_rotary_emb(query, image_rotary_emb)
+        key = apply_rotary_emb(key, image_rotary_emb)
+    if condition_latents is not None:
+        cond_query = attn.to_q(condition_latents)
+        cond_key = attn.to_k(condition_latents)
+        cond_value = attn.to_v(condition_latents)
+        cond_query = cond_query.view(batch_size, -1, attn.heads, head_dim).transpose(
+            1, 2
+        )
+        cond_key = cond_key.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
+        cond_value = cond_value.view(batch_size, -1, attn.heads, head_dim).transpose(
+            1, 2
+        )
+        if attn.norm_q is not None:
+            cond_query = attn.norm_q(cond_query)
+        if attn.norm_k is not None:
+            cond_key = attn.norm_k(cond_key)
+    if cond_rotary_emb is not None:
+        cond_query = apply_rotary_emb(cond_query, cond_rotary_emb)
+        cond_key = apply_rotary_emb(cond_key, cond_rotary_emb)
+    if condition_latents is not None:
+        query = torch.cat([query, cond_query], dim=2)
+        key = torch.cat([key, cond_key], dim=2)
+        value = torch.cat([value, cond_value], dim=2)
+    if not model_config.get("union_cond_attn", True):
+        # If we don't want to use the union condition attention, we need to mask the attention
+        # between the hidden states and the condition latents
+        attention_mask = torch.ones(
+            query.shape[2], key.shape[2], device=query.device, dtype=torch.bool
+        )
+        condition_n = cond_query.shape[2]
+        attention_mask[-condition_n:, :-condition_n] = False
+        attention_mask[:-condition_n, -condition_n:] = False
+    elif model_config.get("independent_condition", False):
+        attention_mask = torch.ones(
+            query.shape[2], key.shape[2], device=query.device, dtype=torch.bool
+        )
+        condition_n = cond_query.shape[2]
+        attention_mask[-condition_n:, :-condition_n] = False
+    if hasattr(attn, "c_factor"):
+        attention_mask = torch.zeros(
+            query.shape[2], key.shape[2], device=query.device, dtype=query.dtype
+        )
+        condition_n = cond_query.shape[2]
+        bias = torch.log(attn.c_factor[0])
+        attention_mask[-condition_n:, :-condition_n] = bias
+        attention_mask[:-condition_n, -condition_n:] = bias
+    hidden_states = F.scaled_dot_product_attention(
+        query, key, value, dropout_p=0.0, is_causal=False, attn_mask=attention_mask
+    )
+    hidden_states = hidden_states.transpose(1, 2).reshape(
+        batch_size, -1, attn.heads * head_dim
+    )
+    hidden_states = hidden_states.to(query.dtype)
+    if encoder_hidden_states is not None:
+        if condition_latents is not None:
+            encoder_hidden_states, hidden_states, condition_latents = (
+                hidden_states[:, : encoder_hidden_states.shape[1]],
+                hidden_states[
+                    :, encoder_hidden_states.shape[1] : -condition_latents.shape[1]
+                ],
+                hidden_states[:, -condition_latents.shape[1] :],
+            )
+        else:
+            encoder_hidden_states, hidden_states = (
+                hidden_states[:, : encoder_hidden_states.shape[1]],
+                hidden_states[:, encoder_hidden_states.shape[1] :],
+            )
+        with enable_lora((attn.to_out[0],), model_config.get("latent_lora", False)):
+            # linear proj
+            hidden_states = attn.to_out[0](hidden_states)
+            # dropout
+            hidden_states = attn.to_out[1](hidden_states)
+        encoder_hidden_states = attn.to_add_out(encoder_hidden_states)
+        if condition_latents is not None:
+            condition_latents = attn.to_out[0](condition_latents)
+            condition_latents = attn.to_out[1](condition_latents)
+        return (
+            (hidden_states, encoder_hidden_states, condition_latents)
+            if condition_latents is not None
+            else (hidden_states, encoder_hidden_states)
+        )
+    elif condition_latents is not None:
+        # if there are condition_latents, we need to separate the hidden_states and the condition_latents
+        hidden_states, condition_latents = (
+            hidden_states[:, : -condition_latents.shape[1]],
+            hidden_states[:, -condition_latents.shape[1] :],
+        )
+        return hidden_states, condition_latents
+    else:
+        return hidden_states
+def block_forward(
+    self,
+    hidden_states: torch.FloatTensor,
+    encoder_hidden_states: torch.FloatTensor,
+    condition_latents: torch.FloatTensor,
+    temb: torch.FloatTensor,
+    cond_temb: torch.FloatTensor,
+    cond_rotary_emb=None,
+    image_rotary_emb=None,
+    model_config: Optional[Dict[str, Any]] = {},
+):
+    """Run one dual-stream transformer block over image, text and condition tokens.
+    ``self`` is the block module; this function reads ``norm1``, ``norm1_context``,
+    ``attn``, ``norm2``, ``norm2_context``, ``ff`` and ``ff_context`` from it.
+    Condition tokens go through the same ``norm1`` / ``norm2`` / ``ff`` modules as
+    ``hidden_states`` but are modulated by ``cond_temb`` instead of ``temb``.
+    Args:
+        hidden_states: Image-stream tokens.
+        encoder_hidden_states: Text-stream tokens.
+        condition_latents: Condition tokens, or ``None`` to run without a condition.
+        temb: Embedding passed as ``emb`` to ``norm1`` and ``norm1_context``.
+        cond_temb: Embedding passed as ``emb`` to ``norm1`` for the condition tokens.
+        cond_rotary_emb: Rotary embedding forwarded to ``attn_forward`` for the
+            condition tokens (only when a condition is present).
+        image_rotary_emb: Rotary embedding forwarded to ``attn_forward``.
+        model_config: Options read here: ``"latent_lora"`` and ``"add_cond_attn"``.
+            The dict is also forwarded to ``attn_forward``. The default ``{}`` is a
+            shared object; it is only read, never mutated, in this function.
+    Returns:
+        ``(encoder_hidden_states, hidden_states, condition_latents)`` -- note the
+        text stream comes first. The last item is ``None`` without a condition.
+    """
+    use_cond = condition_latents is not None
+    # enable_lora(modules, activated): presumably lora_controller.enable_lora, which
+    # zeroes the LoRA scaling of `modules` inside the block unless `activated` is
+    # true. Only the hidden_states path runs inside it; the condition path below
+    # calls the same module outside the context.
+    with enable_lora((self.norm1.linear,), model_config.get("latent_lora", False)):
+        norm_hidden_states, gate_msa, shift_mlp, scale_mlp, gate_mlp = self.norm1(
+            hidden_states, emb=temb
+        )
+    norm_encoder_hidden_states, c_gate_msa, c_shift_mlp, c_scale_mlp, c_gate_mlp = (
+        self.norm1_context(encoder_hidden_states, emb=temb)
+    )
+    if use_cond:
+        # Same norm1 module as the image stream, but driven by cond_temb.
+        (
+            norm_condition_latents,
+            cond_gate_msa,
+            cond_shift_mlp,
+            cond_scale_mlp,
+            cond_gate_mlp,
+        ) = self.norm1(condition_latents, emb=cond_temb)
+    # Attention.
+    result = attn_forward(
+        self.attn,
+        model_config=model_config,
+        hidden_states=norm_hidden_states,
+        encoder_hidden_states=norm_encoder_hidden_states,
+        condition_latents=norm_condition_latents if use_cond else None,
+        image_rotary_emb=image_rotary_emb,
+        cond_rotary_emb=cond_rotary_emb if use_cond else None,
+    )
+    # attn_forward returns (hidden, encoder) or (hidden, encoder, condition).
+    attn_output, context_attn_output = result[:2]
+    cond_attn_output = result[2] if use_cond else None
+    # Gate each attention output and add it to its own residual stream.
+    # 1. hidden_states
+    attn_output = gate_msa.unsqueeze(1) * attn_output
+    hidden_states = hidden_states + attn_output
+    # 2. encoder_hidden_states
+    context_attn_output = c_gate_msa.unsqueeze(1) * context_attn_output
+    encoder_hidden_states = encoder_hidden_states + context_attn_output
+    # 3. condition_latents
+    if use_cond:
+        cond_attn_output = cond_gate_msa.unsqueeze(1) * cond_attn_output
+        condition_latents = condition_latents + cond_attn_output
+        if model_config.get("add_cond_attn", False):
+            # Optionally also add the gated condition attention to the image
+            # stream (in place, on the tensor created a few lines above).
+            hidden_states += cond_attn_output
+    # LayerNorm + MLP.
+    # 1. hidden_states
+    norm_hidden_states = self.norm2(hidden_states)
+    norm_hidden_states = (
+        norm_hidden_states * (1 + scale_mlp[:, None]) + shift_mlp[:, None]
+    )
+    # 2. encoder_hidden_states
+    norm_encoder_hidden_states = self.norm2_context(encoder_hidden_states)
+    norm_encoder_hidden_states = (
+        norm_encoder_hidden_states * (1 + c_scale_mlp[:, None]) + c_shift_mlp[:, None]
+    )
+    # 3. condition_latents
+    if use_cond:
+        norm_condition_latents = self.norm2(condition_latents)
+        norm_condition_latents = (
+            norm_condition_latents * (1 + cond_scale_mlp[:, None])
+            + cond_shift_mlp[:, None]
+        )
+    # Feed-forward.
+    # Only self.ff.net[2] is wrapped by enable_lora, and only for the image stream.
+    with enable_lora((self.ff.net[2],), model_config.get("latent_lora", False)):
+        # 1. hidden_states
+        ff_output = self.ff(norm_hidden_states)
+        ff_output = gate_mlp.unsqueeze(1) * ff_output
+    # 2. encoder_hidden_states
+    context_ff_output = self.ff_context(norm_encoder_hidden_states)
+    context_ff_output = c_gate_mlp.unsqueeze(1) * context_ff_output
+    # 3. condition_latents
+    if use_cond:
+        # Shares self.ff with the image stream, outside the enable_lora context.
+        cond_ff_output = self.ff(norm_condition_latents)
+        cond_ff_output = cond_gate_mlp.unsqueeze(1) * cond_ff_output
+    # Process feed-forward outputs.
+    hidden_states = hidden_states + ff_output
+    encoder_hidden_states = encoder_hidden_states + context_ff_output
+    if use_cond:
+        condition_latents = condition_latents + cond_ff_output
+    # Clip to avoid overflow (only the text stream, and only in float16;
+    # 65504 is the largest finite float16 value).
+    if encoder_hidden_states.dtype == torch.float16:
+        encoder_hidden_states = encoder_hidden_states.clip(-65504, 65504)
+    # The conditional expression binds only to the third tuple element.
+    return encoder_hidden_states, hidden_states, condition_latents if use_cond else None
+def single_block_forward(
+    self,
+    hidden_states: torch.FloatTensor,
+    temb: torch.FloatTensor,
+    image_rotary_emb=None,
+    condition_latents: torch.FloatTensor = None,
+    cond_temb: torch.FloatTensor = None,
+    cond_rotary_emb=None,
+    model_config: Optional[Dict[str, Any]] = {},
+):
+    """Run one single-stream transformer block, optionally with condition tokens.
+    ``self`` is the block module; this function reads ``norm``, ``proj_mlp``,
+    ``act_mlp``, ``attn`` and ``proj_out`` from it. Condition tokens reuse the same
+    modules as ``hidden_states`` but are modulated by ``cond_temb``.
+    Args:
+        hidden_states: Input tokens of the block.
+        temb: Embedding passed as ``emb`` to ``norm`` for ``hidden_states``.
+        image_rotary_emb: Rotary embedding forwarded to ``attn_forward``.
+        condition_latents: Condition tokens, or ``None`` to run without a condition.
+        cond_temb: Embedding passed as ``emb`` to ``norm`` for the condition tokens.
+        cond_rotary_emb: Rotary embedding forwarded to ``attn_forward`` for the
+            condition tokens.
+        model_config: Only ``"latent_lora"`` is read here; the dict is also forwarded
+            to ``attn_forward``. The default ``{}`` is shared but never mutated here.
+    Returns:
+        ``hidden_states`` alone without a condition, otherwise the tuple
+        ``(hidden_states, condition_latents)``.
+    """
+    using_cond = condition_latents is not None
+    residual = hidden_states
+    # enable_lora(modules, activated): presumably lora_controller.enable_lora, which
+    # zeroes the LoRA scaling of `modules` inside the block unless `activated` is
+    # true. The condition path below uses the same modules outside the context.
+    with enable_lora(
+        (
+            self.norm.linear,
+            self.proj_mlp,
+        ),
+        model_config.get("latent_lora", False),
+    ):
+        norm_hidden_states, gate = self.norm(hidden_states, emb=temb)
+        mlp_hidden_states = self.act_mlp(self.proj_mlp(norm_hidden_states))
+    if using_cond:
+        residual_cond = condition_latents
+        norm_condition_latents, cond_gate = self.norm(condition_latents, emb=cond_temb)
+        mlp_cond_hidden_states = self.act_mlp(self.proj_mlp(norm_condition_latents))
+    # The condition kwargs are only passed when a condition exists, so the inner
+    # `if using_cond` on cond_rotary_emb is always true when it is evaluated.
+    attn_output = attn_forward(
+        self.attn,
+        model_config=model_config,
+        hidden_states=norm_hidden_states,
+        image_rotary_emb=image_rotary_emb,
+        **(
+            {
+                "condition_latents": norm_condition_latents,
+                "cond_rotary_emb": cond_rotary_emb if using_cond else None,
+            }
+            if using_cond
+            else {}
+        ),
+    )
+    if using_cond:
+        # Without encoder_hidden_states, attn_forward returns
+        # (hidden_states, condition_latents) when a condition is given.
+        attn_output, cond_attn_output = attn_output
+    with enable_lora((self.proj_out,), model_config.get("latent_lora", False)):
+        # Attention and MLP branches are concatenated on the last (feature) axis,
+        # projected, gated, and added back to the block input.
+        hidden_states = torch.cat([attn_output, mlp_hidden_states], dim=2)
+        gate = gate.unsqueeze(1)
+        hidden_states = gate * self.proj_out(hidden_states)
+        hidden_states = residual + hidden_states
+    if using_cond:
+        # Same projection for the condition tokens, outside the enable_lora context.
+        condition_latents = torch.cat([cond_attn_output, mlp_cond_hidden_states], dim=2)
+        cond_gate = cond_gate.unsqueeze(1)
+        condition_latents = cond_gate * self.proj_out(condition_latents)
+        condition_latents = residual_cond + condition_latents
+    # Clip to the finite float16 range to avoid overflow (hidden_states only).
+    if hidden_states.dtype == torch.float16:
+        hidden_states = hidden_states.clip(-65504, 65504)
+    return hidden_states if not using_cond else (hidden_states, condition_latents)

wfControl/src/flux/condition.py ADDED Viewed

	@@ -0,0 +1,138 @@

+import torch
+from typing import Optional, Union, List, Tuple
+from diffusers.pipelines import FluxPipeline
+from PIL import Image, ImageFilter
+import numpy as np
+import cv2
+from .pipeline_tools import encode_images
+# Maps a condition type name to the integer type id returned by
+# Condition.type_id / Condition.get_type_id. The ids are not contiguous
+# (2, 3 and 5 are unused here).
+condition_dict = {
+    "depth": 0,
+    "canny": 1,
+    "subject": 4,
+    "coloring": 6,
+    "deblurring": 7,
+    "depth_pred": 8,
+    "fill": 9,
+    "sr": 10,
+    "cartoon": 11,
+}
+class Condition(object):
+    def __init__(
+        self,
+        condition_type: str,
+        raw_img: Union[Image.Image, torch.Tensor] = None,
+        condition: Union[Image.Image, torch.Tensor] = None,
+        mask=None,
+        position_delta=None,
+        position_scale=1.0,
+    ) -> None:
+        self.condition_type = condition_type
+        assert raw_img is not None or condition is not None
+        if raw_img is not None:
+            self.condition = self.get_condition(condition_type, raw_img)
+        else:
+            self.condition = condition
+        self.position_delta = position_delta
+        self.position_scale = position_scale
+        # TODO: Add mask support
+        assert mask is None, "Mask not supported yet"
+    def get_condition(
+        self, condition_type: str, raw_img: Union[Image.Image, torch.Tensor]
+    ) -> Union[Image.Image, torch.Tensor]:
+        """
+        Returns the condition image.
+        """
+        if condition_type == "depth":
+            from transformers import pipeline
+            depth_pipe = pipeline(
+                task="depth-estimation",
+                model="LiheYoung/depth-anything-small-hf",
+                device="cuda",
+            )
+            source_image = raw_img.convert("RGB")
+            condition_img = depth_pipe(source_image)["depth"].convert("RGB")
+            return condition_img
+        elif condition_type == "canny":
+            img = np.array(raw_img)
+            edges = cv2.Canny(img, 100, 200)
+            edges = Image.fromarray(edges).convert("RGB")
+            return edges
+        elif condition_type == "subject":
+            return raw_img
+        elif condition_type == "coloring":
+            return raw_img.convert("L").convert("RGB")
+        elif condition_type == "deblurring":
+            condition_image = (
+                raw_img.convert("RGB")
+                .filter(ImageFilter.GaussianBlur(10))
+                .convert("RGB")
+            )
+            return condition_image
+        elif condition_type == "fill":
+            return raw_img.convert("RGB")
+        elif condition_type == "cartoon":
+            return raw_img.convert("RGB")
+        return self.condition
+    @property
+    def type_id(self) -> int:
+        """
+        Returns the type id of the condition.
+        """
+        return condition_dict[self.condition_type]
+    @classmethod
+    def get_type_id(cls, condition_type: str) -> int:
+        """
+        Returns the type id of the condition.
+        """
+        return condition_dict[condition_type]
+    def encode(
+        self, pipe: FluxPipeline, empty: bool = False
+    ) -> Tuple[torch.Tensor, torch.Tensor, int]:
+        """
+        Encodes the condition into tokens, ids and type_id.
+        """
+        if self.condition_type in [
+            "depth",
+            "canny",
+            "subject",
+            "coloring",
+            "deblurring",
+            "depth_pred",
+            "fill",
+            "sr",
+            "cartoon",
+        ]:
+            if empty:
+                # make the condition black
+                e_condition = Image.new("RGB", self.condition.size, (0, 0, 0))
+                e_condition = e_condition.convert("RGB")
+                tokens, ids = encode_images(pipe, e_condition)
+            else:
+                tokens, ids = encode_images(pipe, self.condition)
+            tokens, ids = encode_images(pipe, self.condition)
+        else:
+            raise NotImplementedError(
+                f"Condition type {self.condition_type} not implemented"
+            )
+        if self.position_delta is None and self.condition_type == "subject":
+            self.position_delta = [0, -self.condition.size[0] // 16]
+        if self.position_delta is not None:
+            ids[:, 1] += self.position_delta[0]
+            ids[:, 2] += self.position_delta[1]
+        if self.position_scale != 1.0:
+            scale_bias = (self.position_scale - 1.0) / 2
+            ids[:, 1] *= self.position_scale
+            ids[:, 2] *= self.position_scale
+            ids[:, 1] += scale_bias
+            ids[:, 2] += scale_bias
+        type_id = torch.ones_like(ids[:, :1]) * self.type_id
+        return tokens, ids, type_id

wfControl/src/flux/generate.py ADDED Viewed

	@@ -0,0 +1,321 @@

+import torch
+import yaml, os
+from diffusers.pipelines import FluxPipeline
+from typing import List, Union, Optional, Dict, Any, Callable
+from .transformer import tranformer_forward
+from .condition import Condition
+from diffusers.pipelines.flux.pipeline_flux import (
+    FluxPipelineOutput,
+    calculate_shift,
+    retrieve_timesteps,
+    np,
+)
+def get_config(config_path: str = None):
+    config_path = config_path or os.environ.get("XFL_CONFIG")
+    if not config_path:
+        return {}
+    with open(config_path, "r") as f:
+        config = yaml.safe_load(f)
+    return config
+def prepare_params(
+    prompt: Union[str, List[str]] = None,
+    prompt_2: Optional[Union[str, List[str]]] = None,
+    height: Optional[int] = 512,
+    width: Optional[int] = 512,
+    num_inference_steps: int = 28,
+    timesteps: List[int] = None,
+    guidance_scale: float = 3.5,
+    num_images_per_prompt: Optional[int] = 1,
+    generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
+    latents: Optional[torch.FloatTensor] = None,
+    prompt_embeds: Optional[torch.FloatTensor] = None,
+    pooled_prompt_embeds: Optional[torch.FloatTensor] = None,
+    output_type: Optional[str] = "pil",
+    return_dict: bool = True,
+    joint_attention_kwargs: Optional[Dict[str, Any]] = None,
+    callback_on_step_end: Optional[Callable[[int, int, Dict], None]] = None,
+    callback_on_step_end_tensor_inputs: List[str] = ["latents"],
+    max_sequence_length: int = 512,
+    **kwargs: dict,
+):
+    return (
+        prompt,
+        prompt_2,
+        height,
+        width,
+        num_inference_steps,
+        timesteps,
+        guidance_scale,
+        num_images_per_prompt,
+        generator,
+        latents,
+        prompt_embeds,
+        pooled_prompt_embeds,
+        output_type,
+        return_dict,
+        joint_attention_kwargs,
+        callback_on_step_end,
+        callback_on_step_end_tensor_inputs,
+        max_sequence_length,
+    )
+def seed_everything(seed: int = 42):
+    torch.backends.cudnn.deterministic = True
+    torch.manual_seed(seed)
+    np.random.seed(seed)
+@torch.no_grad()
+def generate(
+    pipeline: FluxPipeline,
+    conditions: List[Condition] = None,
+    config_path: str = None,
+    model_config: Optional[Dict[str, Any]] = {},
+    condition_scale: float = 1.0,
+    default_lora: bool = False,
+    image_guidance_scale: float = 1.0,
+    **params: dict,
+):
+    model_config = model_config or get_config(config_path).get("model", {})
+    if condition_scale != 1:
+        for name, module in pipeline.transformer.named_modules():
+            if not name.endswith(".attn"):
+                continue
+            module.c_factor = torch.ones(1, 1) * condition_scale
+    self = pipeline
+    (
+        prompt,
+        prompt_2,
+        height,
+        width,
+        num_inference_steps,
+        timesteps,
+        guidance_scale,
+        num_images_per_prompt,
+        generator,
+        latents,
+        prompt_embeds,
+        pooled_prompt_embeds,
+        output_type,
+        return_dict,
+        joint_attention_kwargs,
+        callback_on_step_end,
+        callback_on_step_end_tensor_inputs,
+        max_sequence_length,
+    ) = prepare_params(**params)
+    height = height or self.default_sample_size * self.vae_scale_factor
+    width = width or self.default_sample_size * self.vae_scale_factor
+    # 1. Check inputs. Raise error if not correct
+    self.check_inputs(
+        prompt,
+        prompt_2,
+        height,
+        width,
+        prompt_embeds=prompt_embeds,
+        pooled_prompt_embeds=pooled_prompt_embeds,
+        callback_on_step_end_tensor_inputs=callback_on_step_end_tensor_inputs,
+        max_sequence_length=max_sequence_length,
+    )
+    self._guidance_scale = guidance_scale
+    self._joint_attention_kwargs = joint_attention_kwargs
+    self._interrupt = False
+    # 2. Define call parameters
+    if prompt is not None and isinstance(prompt, str):
+        batch_size = 1
+    elif prompt is not None and isinstance(prompt, list):
+        batch_size = len(prompt)
+    else:
+        batch_size = prompt_embeds.shape[0]
+    device = self._execution_device
+    lora_scale = (
+        self.joint_attention_kwargs.get("scale", None)
+        if self.joint_attention_kwargs is not None
+        else None
+    )
+    (
+        prompt_embeds,
+        pooled_prompt_embeds,
+        text_ids,
+    ) = self.encode_prompt(
+        prompt=prompt,
+        prompt_2=prompt_2,
+        prompt_embeds=prompt_embeds,
+        pooled_prompt_embeds=pooled_prompt_embeds,
+        device=device,
+        num_images_per_prompt=num_images_per_prompt,
+        max_sequence_length=max_sequence_length,
+        lora_scale=lora_scale,
+    )
+    # 4. Prepare latent variables
+    num_channels_latents = self.transformer.config.in_channels // 4
+    latents, latent_image_ids = self.prepare_latents(
+        batch_size * num_images_per_prompt,
+        num_channels_latents,
+        height,
+        width,
+        prompt_embeds.dtype,
+        device,
+        generator,
+        latents,
+    )
+    # 4.1. Prepare conditions
+    condition_latents, condition_ids, condition_type_ids = ([] for _ in range(3))
+    use_condition = conditions is not None or []
+    if use_condition:
+        assert len(conditions) <= 1, "Only one condition is supported for now."
+        if not default_lora:
+            pipeline.set_adapters(conditions[0].condition_type)
+        for condition in conditions:
+            tokens, ids, type_id = condition.encode(self)
+            condition_latents.append(tokens)  # [batch_size, token_n, token_dim]
+            condition_ids.append(ids)  # [token_n, id_dim(3)]
+            condition_type_ids.append(type_id)  # [token_n, 1]
+        condition_latents = torch.cat(condition_latents, dim=1)
+        condition_ids = torch.cat(condition_ids, dim=0)
+        condition_type_ids = torch.cat(condition_type_ids, dim=0)
+    # 5. Prepare timesteps
+    sigmas = np.linspace(1.0, 1 / num_inference_steps, num_inference_steps)
+    image_seq_len = latents.shape[1]
+    mu = calculate_shift(
+        image_seq_len,
+        self.scheduler.config.base_image_seq_len,
+        self.scheduler.config.max_image_seq_len,
+        self.scheduler.config.base_shift,
+        self.scheduler.config.max_shift,
+    )
+    timesteps, num_inference_steps = retrieve_timesteps(
+        self.scheduler,
+        num_inference_steps,
+        device,
+        timesteps,
+        sigmas,
+        mu=mu,
+    )
+    num_warmup_steps = max(
+        len(timesteps) - num_inference_steps * self.scheduler.order, 0
+    )
+    self._num_timesteps = len(timesteps)
+    # 6. Denoising loop
+    with self.progress_bar(total=num_inference_steps) as progress_bar:
+        for i, t in enumerate(timesteps):
+            if self.interrupt:
+                continue
+            # broadcast to batch dimension in a way that's compatible with ONNX/Core ML
+            timestep = t.expand(latents.shape[0]).to(latents.dtype)
+            # handle guidance
+            if self.transformer.config.guidance_embeds:
+                guidance = torch.tensor([guidance_scale], device=device)
+                guidance = guidance.expand(latents.shape[0])
+            else:
+                guidance = None
+            noise_pred = tranformer_forward(
+                self.transformer,
+                model_config=model_config,
+                # Inputs of the condition (new feature)
+                condition_latents=condition_latents if use_condition else None,
+                condition_ids=condition_ids if use_condition else None,
+                condition_type_ids=condition_type_ids if use_condition else None,
+                # Inputs to the original transformer
+                hidden_states=latents,
+                # YiYi notes: divide it by 1000 for now because we scale it by 1000 in the transforme rmodel (we should not keep it but I want to keep the inputs same for the model for testing)
+                timestep=timestep / 1000,
+                guidance=guidance,
+                pooled_projections=pooled_prompt_embeds,
+                encoder_hidden_states=prompt_embeds,
+                txt_ids=text_ids,
+                img_ids=latent_image_ids,
+                joint_attention_kwargs=self.joint_attention_kwargs,
+                return_dict=False,
+            )[0]
+            if image_guidance_scale != 1.0:
+                uncondition_latents = condition.encode(self, empty=True)[0]
+                unc_pred = tranformer_forward(
+                    self.transformer,
+                    model_config=model_config,
+                    # Inputs of the condition (new feature)
+                    condition_latents=uncondition_latents if use_condition else None,
+                    condition_ids=condition_ids if use_condition else None,
+                    condition_type_ids=condition_type_ids if use_condition else None,
+                    # Inputs to the original transformer
+                    hidden_states=latents,
+                    # YiYi notes: divide it by 1000 for now because we scale it by 1000 in the transforme rmodel (we should not keep it but I want to keep the inputs same for the model for testing)
+                    timestep=timestep / 1000,
+                    guidance=torch.ones_like(guidance),
+                    pooled_projections=pooled_prompt_embeds,
+                    encoder_hidden_states=prompt_embeds,
+                    txt_ids=text_ids,
+                    img_ids=latent_image_ids,
+                    joint_attention_kwargs=self.joint_attention_kwargs,
+                    return_dict=False,
+                )[0]
+                noise_pred = unc_pred + image_guidance_scale * (noise_pred - unc_pred)
+            # compute the previous noisy sample x_t -> x_t-1
+            latents_dtype = latents.dtype
+            latents = self.scheduler.step(noise_pred, t, latents, return_dict=False)[0]
+            if latents.dtype != latents_dtype:
+                if torch.backends.mps.is_available():
+                    # some platforms (eg. apple mps) misbehave due to a pytorch bug: https://github.com/pytorch/pytorch/pull/99272
+                    latents = latents.to(latents_dtype)
+            if callback_on_step_end is not None:
+                callback_kwargs = {}
+                for k in callback_on_step_end_tensor_inputs:
+                    callback_kwargs[k] = locals()[k]
+                callback_outputs = callback_on_step_end(self, i, t, callback_kwargs)
+                latents = callback_outputs.pop("latents", latents)
+                prompt_embeds = callback_outputs.pop("prompt_embeds", prompt_embeds)
+            # call the callback, if provided
+            if i == len(timesteps) - 1 or (
+                (i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0
+            ):
+                progress_bar.update()
+    if output_type == "latent":
+        image = latents
+    else:
+        latents = self._unpack_latents(latents, height, width, self.vae_scale_factor)
+        latents = (
+            latents / self.vae.config.scaling_factor
+        ) + self.vae.config.shift_factor
+        image = self.vae.decode(latents, return_dict=False)[0]
+        image = self.image_processor.postprocess(image, output_type=output_type)
+    # Offload all models
+    self.maybe_free_model_hooks()
+    if condition_scale != 1:
+        for name, module in pipeline.transformer.named_modules():
+            if not name.endswith(".attn"):
+                continue
+            del module.c_factor
+    if not return_dict:
+        return (image,)
+    return FluxPipelineOutput(images=image)

wfControl/src/flux/lora_controller.py ADDED Viewed

	@@ -0,0 +1,75 @@

+from peft.tuners.tuners_utils import BaseTunerLayer
+from typing import List, Any, Optional, Type
+class enable_lora:
+    def __init__(self, lora_modules: List[BaseTunerLayer], activated: bool) -> None:
+        self.activated: bool = activated
+        if activated:
+            return
+        self.lora_modules: List[BaseTunerLayer] = [
+            each for each in lora_modules if isinstance(each, BaseTunerLayer)
+        ]
+        self.scales = [
+            {
+                active_adapter: lora_module.scaling[active_adapter]
+                for active_adapter in lora_module.active_adapters
+            }
+            for lora_module in self.lora_modules
+        ]
+    def __enter__(self) -> None:
+        if self.activated:
+            return
+        for lora_module in self.lora_modules:
+            if not isinstance(lora_module, BaseTunerLayer):
+                continue
+            lora_module.scale_layer(0)
+    def __exit__(
+        self,
+        exc_type: Optional[Type[BaseException]],
+        exc_val: Optional[BaseException],
+        exc_tb: Optional[Any],
+    ) -> None:
+        if self.activated:
+            return
+        for i, lora_module in enumerate(self.lora_modules):
+            if not isinstance(lora_module, BaseTunerLayer):
+                continue
+            for active_adapter in lora_module.active_adapters:
+                lora_module.scaling[active_adapter] = self.scales[i][active_adapter]
+class set_lora_scale:
+    def __init__(self, lora_modules: List[BaseTunerLayer], scale: float) -> None:
+        self.lora_modules: List[BaseTunerLayer] = [
+            each for each in lora_modules if isinstance(each, BaseTunerLayer)
+        ]
+        self.scales = [
+            {
+                active_adapter: lora_module.scaling[active_adapter]
+                for active_adapter in lora_module.active_adapters
+            }
+            for lora_module in self.lora_modules
+        ]
+        self.scale = scale
+    def __enter__(self) -> None:
+        for lora_module in self.lora_modules:
+            if not isinstance(lora_module, BaseTunerLayer):
+                continue
+            lora_module.scale_layer(self.scale)
+    def __exit__(
+        self,
+        exc_type: Optional[Type[BaseException]],
+        exc_val: Optional[BaseException],
+        exc_tb: Optional[Any],
+    ) -> None:
+        for i, lora_module in enumerate(self.lora_modules):
+            if not isinstance(lora_module, BaseTunerLayer):
+                continue
+            for active_adapter in lora_module.active_adapters:
+                lora_module.scaling[active_adapter] = self.scales[i][active_adapter]

wfControl/src/flux/pipeline_tools.py ADDED Viewed

	@@ -0,0 +1,52 @@

+from diffusers.pipelines import FluxPipeline
+from diffusers.utils import logging
+from diffusers.pipelines.flux.pipeline_flux import logger
+from torch import Tensor
+def encode_images(pipeline: FluxPipeline, images: Tensor):
+    images = pipeline.image_processor.preprocess(images)
+    images = images.to(pipeline.device).to(pipeline.dtype)
+    images = pipeline.vae.encode(images).latent_dist.sample()
+    images = (
+        images - pipeline.vae.config.shift_factor
+    ) * pipeline.vae.config.scaling_factor
+    images_tokens = pipeline._pack_latents(images, *images.shape)
+    images_ids = pipeline._prepare_latent_image_ids(
+        images.shape[0],
+        images.shape[2],
+        images.shape[3],
+        pipeline.device,
+        pipeline.dtype,
+    )
+    if images_tokens.shape[1] != images_ids.shape[0]:
+        images_ids = pipeline._prepare_latent_image_ids(
+            images.shape[0],
+            images.shape[2] // 2,
+            images.shape[3] // 2,
+            pipeline.device,
+            pipeline.dtype,
+        )
+    return images_tokens, images_ids
+def prepare_text_input(pipeline: FluxPipeline, prompts, max_sequence_length=512):
+    # Turn off warnings (CLIP overflow)
+    logger.setLevel(logging.ERROR)
+    (
+        prompt_embeds,
+        pooled_prompt_embeds,
+        text_ids,
+    ) = pipeline.encode_prompt(
+        prompt=prompts,
+        prompt_2=None,
+        prompt_embeds=None,
+        pooled_prompt_embeds=None,
+        device=pipeline.device,
+        num_images_per_prompt=1,
+        max_sequence_length=max_sequence_length,
+        lora_scale=None,
+    )
+    # Turn on warnings
+    logger.setLevel(logging.WARNING)
+    return prompt_embeds, pooled_prompt_embeds, text_ids