Spaces: Running on Zero
Upload 29 files
- .gitignore +232 -0
- .gitmodules +0 -0
- LICENSE +54 -0
- app.py +586 -0
- gradio_app.py +568 -0
- gradio_edit.py +563 -0
- infer_withanyone.py +309 -0
- nohup.out +2 -0
- requirements.txt +24 -0
- util.py +411 -0
- withanyone/flux/__pycache__/math.cpython-310.pyc +0 -0
- withanyone/flux/__pycache__/model.cpython-310.pyc +0 -0
- withanyone/flux/__pycache__/pipeline.cpython-310.pyc +0 -0
- withanyone/flux/__pycache__/sampling.cpython-310.pyc +0 -0
- withanyone/flux/__pycache__/util.cpython-310.pyc +0 -0
- withanyone/flux/math.py +49 -0
- withanyone/flux/model.py +610 -0
- withanyone/flux/modules/__pycache__/autoencoder.cpython-310.pyc +0 -0
- withanyone/flux/modules/__pycache__/conditioner.cpython-310.pyc +0 -0
- withanyone/flux/modules/__pycache__/layers.cpython-310.pyc +0 -0
- withanyone/flux/modules/autoencoder.py +327 -0
- withanyone/flux/modules/conditioner.py +53 -0
- withanyone/flux/modules/layers.py +530 -0
- withanyone/flux/pipeline.py +406 -0
- withanyone/flux/sampling.py +171 -0
- withanyone/flux/util.py +518 -0
- withanyone/utils/convert_yaml_to_args_file.py +22 -0
.gitignore
ADDED
@@ -0,0 +1,232 @@
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
cover/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
.pybuilder/
target/

# Jupyter Notebook
.ipynb_checkpoints

# IPython
profile_default/
ipython_config.py

# pyenv
# For a library or package, you might want to ignore these files since the code is
# intended to run in multiple environments; otherwise, check them in:
# .python-version

# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock

# UV
# Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control.
# This is especially recommended for binary packages to ensure reproducibility, and is more
# commonly ignored for libraries.
#uv.lock

# poetry
# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
# This is especially recommended for binary packages to ensure reproducibility, and is more
# commonly ignored for libraries.
# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
#poetry.lock

# pdm
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
#pdm.lock
# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
# in version control.
# https://pdm.fming.dev/latest/usage/project/#working-with-version-control
.pdm.toml
.pdm-python
.pdm-build/

# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
__pypackages__/

# Celery stuff
celerybeat-schedule
celerybeat.pid

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/
.dmypy.json
dmypy.json

# Pyre type checker
.pyre/

# pytype static type analyzer
.pytype/

# Cython debug symbols
cython_debug/

# PyCharm
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
# and can be added to the global gitignore or merged into this file. For a more nuclear
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
#.idea/

# Ruff stuff:
.ruff_cache/

# PyPI configuration file
.pypirc

# User config files
.vscode/
output/

# ckpt
*.bin
*.pt
*.pth
ckpts/
ckpt-*
ckpts/*

# legacy code
legacy/
legacy/*

# wandb
wandb/
wandb/*

# arcface models
models/

# debug
debug*


data_single/
data_single_10/
lora_attampt/
lora_attampt/*

*.safetensors
*.ckpt

.output/
for_bbox/

# data
data/
datasets/

nohup.out

10**
temp_generated.png

facenet_pytorch/
facenet_pytorch/*

# AdaFace/
# AdaFace/*

pretrained/

git_backup/
git_backup/*
.gitmodules
ADDED
File without changes
LICENSE
ADDED
@@ -0,0 +1,54 @@
FLUX.1 [dev] Non-Commercial License v1.1.1

Black Forest Labs Inc. (“we” or “our” or “Company”) is pleased to make available the weights, parameters and inference code for the FLUX.1 [dev] Model (as defined below) freely available for your non-commercial and non-production use as set forth in this FLUX.1 [dev] Non-Commercial License (“License”). The “FLUX.1 [dev] Model” means the FLUX.1 [dev] AI models and models denoted as FLUX.1 [dev], including but not limited to FLUX.1 [dev], FLUX.1 Fill [dev], FLUX.1 Depth [dev], FLUX.1 Canny [dev], FLUX.1 Redux [dev], FLUX.1 Canny [dev] LoRA, FLUX.1 Depth [dev] LoRA, and FLUX.1 Kontext [dev], and their elements which includes algorithms, software, checkpoints, parameters, source code (inference code, evaluation code, and if applicable, fine-tuning code) and any other materials associated with the FLUX.1 [dev] AI models made available by Company under this License, including if any, the technical documentation, manuals and instructions for the use and operation thereof (collectively, “FLUX.1 [dev] Model”). Note that we may also make available certain elements of what is included in the definition of “FLUX.1 [dev] Model” under a separate license, such as the inference code, and nothing in this License will be deemed to restrict or limit any other licenses granted by us in such elements.

By downloading, accessing, using, Distributing (as defined below), or creating a Derivative (as defined below) of the FLUX.1 [dev] Model, you agree to the terms of this License. If you do not agree to this License, then you do not have any rights to access, use, Distribute or create a Derivative of the FLUX.1 [dev] Model and you must immediately cease using the FLUX.1 [dev] Model. If you are agreeing to be bound by the terms of this License on behalf of your employer or other entity, you represent and warrant to us that you have full legal authority to bind your employer or such entity to this License. If you do not have the requisite authority, you may not accept the License or access the FLUX.1 [dev] Model on behalf of your employer or other entity.

1. Definitions.
- a. “Derivative” means any (i) modified version of the FLUX.1 [dev] Model (including but not limited to any customized or fine-tuned version thereof), (ii) work based on the FLUX.1 [dev] Model, or (iii) any other derivative work thereof. For the avoidance of doubt, Outputs are not considered Derivatives under this License.
- b. “Distribution” or “Distribute” or “Distributing” means providing or making available, by any means, a copy of the FLUX.1 [dev] Models and/or the Derivatives as the case may be.
- c. “Non-Commercial Purpose” means any of the following uses, but only so far as you do not receive any direct or indirect payment arising from the use of the FLUX.1 [dev] Model, Derivatives, or FLUX Content Filters (as defined below): (i) personal use for research, experiment, and testing for the benefit of public knowledge, personal study, private entertainment, hobby projects, or otherwise not directly or indirectly connected to any commercial activities, business operations, or employment responsibilities; (ii) use by commercial or for-profit entities for testing, evaluation, or non-commercial research and development in a non-production environment; and (iii) use by any charitable organization for charitable purposes, or for testing or evaluation. For clarity, use (a) for revenue-generating activity, (b) in direct interactions with or that has impact on end users, or (c) to train, fine tune or distill other models for commercial use, in each case is not a Non-Commercial Purpose.
- d. “Outputs” means any content generated by the operation of the FLUX.1 [dev] Models or the Derivatives from an input (such as an image input) or prompt (i.e., text instructions) provided by users. For the avoidance of doubt, Outputs do not include any components of the FLUX.1 [dev] Models, such as any fine-tuned versions of the FLUX.1 [dev] Models, the weights, or parameters.
- e. “you” or “your” means the individual or entity entering into this License with Company.

2. License Grant.
- a. License. Subject to your compliance with this License, Company grants you a non-exclusive, worldwide, non-transferable, non-sublicensable, revocable, royalty free and limited license to access, use, create Derivatives of, and Distribute the FLUX.1 [dev] Models and Derivatives solely for your Non-Commercial Purposes. The foregoing license is personal to you, and you may not assign or sublicense this License or any other rights or obligations under this License without Company’s prior written consent; any such assignment or sublicense will be void and will automatically and immediately terminate this License. Any restrictions set forth herein regarding the FLUX.1 [dev] Model also apply to any Derivative you create or that are created on your behalf.
- b. Non-Commercial Use Only. You may only access, use, Distribute, or create Derivatives of the FLUX.1 [dev] Model or Derivatives for Non-Commercial Purposes. If you want to use a FLUX.1 [dev] Model or a Derivative for any purpose that is not expressly authorized under this License, such as for a commercial activity, you must request a license from Company, which Company may grant to you in Company’s sole discretion and which additional use may be subject to a fee, royalty or other revenue share. Please see www.bfl.ai if you would like a commercial license.
- c. Reserved Rights. The grant of rights expressly set forth in this License are the complete grant of rights to you in the FLUX.1 [dev] Model, and no other licenses are granted, whether by waiver, estoppel, implication, equity or otherwise. Company and its licensors reserve all rights not expressly granted by this License.
- d. Outputs. We claim no ownership rights in and to the Outputs. You are solely responsible for the Outputs you generate and their subsequent uses in accordance with this License. You may use Output for any purpose (including for commercial purposes), except as expressly prohibited herein. You may not use the Output to train, fine-tune or distill a model that is competitive with the FLUX.1 [dev] Model or the FLUX.1 Kontext [dev] Model.
- e. You may access, use, Distribute, or create Output of the FLUX.1 [dev] Model or Derivatives if you: (i) (A) implement and maintain content filtering measures (“Content Filters”) for your use of the FLUX.1 [dev] Model or Derivatives to prevent the creation, display, transmission, generation, or dissemination of unlawful or infringing content, which may include Content Filters that we may make available for use with the FLUX.1 [dev] Model (“FLUX Content Filters”), or (B) ensure Output undergoes review for unlawful or infringing content before public or non-public distribution, display, transmission or dissemination; and (ii) ensure Output includes disclosure (or other indication) that the Output was generated or modified using artificial intelligence technologies to the extent required under applicable law.

3. Distribution. Subject to this License, you may Distribute copies of the FLUX.1 [dev] Model and/or Derivatives made by you, under the following conditions:
- a. you must make available a copy of this License to third-party recipients of the FLUX.1 [dev] Models and/or Derivatives you Distribute, and specify that any rights to use the FLUX.1 [dev] Models and/or Derivatives shall be directly granted by Company to said third-party recipients pursuant to this License;
- b. you must prominently display the following notice alongside the Distribution of the FLUX.1 [dev] Model or Derivative (such as via a “Notice” text file distributed as part of such FLUX.1 [dev] Model or Derivative) (the “Attribution Notice”):

“The FLUX.1 [dev] Model is licensed by Black Forest Labs Inc. under the FLUX.1 [dev] Non-Commercial License. Copyright Black Forest Labs Inc.
IN NO EVENT SHALL BLACK FOREST LABS INC. BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH USE OF THIS MODEL.”

- c. in the case of Distribution of Derivatives made by you: (i) you must also include in the Attribution Notice a statement that you have modified the applicable FLUX.1 [dev] Model; (ii) any terms and conditions you impose on any third-party recipients relating to Derivatives made by or for you shall neither limit such third-party recipients’ use of the FLUX.1 [dev] Model or any Derivatives made by or for Company in accordance with this License nor conflict with any of its terms and conditions and must include disclaimer of warranties and limitation of liability provisions that are at least as protective of Company as those set forth herein; and (iii) you must not misrepresent or imply, through any means, that the Derivatives made by or for you and/or any modified version of the FLUX.1 [dev] Model you Distribute under your name and responsibility is an official product of the Company or has been endorsed, approved or validated by the Company, unless you are authorized by Company to do so in writing.

4. Restrictions. You will not, and will not permit, assist or cause any third party to
- a. use, modify, copy, reproduce, create Derivatives of, or Distribute the FLUX.1 [dev] Model (or any Derivative thereof, or any data produced by the FLUX.1 [dev] Model), in whole or in part, (i) for any commercial or production purposes, (ii) military purposes, (iii) purposes of surveillance, including any research or development relating to surveillance, (iv) biometric processing, (v) in any manner that infringes, misappropriates, or otherwise violates (or is likely to infringe, misappropriate, or otherwise violate) any third party’s legal rights, including rights of publicity or “digital replica” rights, (vi) in any unlawful, fraudulent, defamatory, or abusive activity, (vii) to generate unlawful content, including child sexual abuse material, or non-consensual intimate images; or (viii) in any manner that violates any applicable law and violating any privacy or security laws, rules, regulations, directives, or governmental requirements (including the General Data Privacy Regulation (Regulation (EU) 2016/679), the California Consumer Privacy Act, any and all laws governing the processing of biometric information, and the EU Artificial Intelligence Act (Regulation (EU) 2024/1689), as well as all amendments and successor laws to any of the foregoing;
- b. alter or remove copyright and other proprietary notices which appear on or in any portion of the FLUX.1 [dev] Model;
- c. utilize any equipment, device, software, or other means to circumvent or remove any security or protection used by Company in connection with the FLUX.1 [dev] Model, or to circumvent or remove any usage restrictions, or to enable functionality disabled by FLUX.1 [dev] Model;
- d. offer or impose any terms on the FLUX.1 [dev] Model that alter, restrict, or are inconsistent with the terms of this License;
- e. violate any applicable U.S. and non-U.S. export control and trade sanctions laws (“Export Laws”) in connection with your use or Distribution of any FLUX.1 [dev] Model;
- f. directly or indirectly Distribute, export, or otherwise transfer FLUX.1 [dev] Model (i) to any individual, entity, or country prohibited by Export Laws; (ii) to anyone on U.S. or non-U.S. government restricted parties lists; (iii) for any purpose prohibited by Export Laws, including nuclear, chemical or biological weapons, or missile technology applications; (iv) use or download FLUX.1 [dev] Model if you or they are (a) located in a comprehensively sanctioned jurisdiction, (b) currently listed on any U.S. or non-U.S. restricted parties list, or (c) for any purpose prohibited by Export Laws; and (v) will not disguise your location through IP proxying or other methods.

5. DISCLAIMERS. THE FLUX.1 [dev] MODEL AND FLUX CONTENT FILTERS ARE PROVIDED “AS IS” AND “WITH ALL FAULTS” WITH NO WARRANTY OF ANY KIND, EXPRESS OR IMPLIED. COMPANY EXPRESSLY DISCLAIMS ALL REPRESENTATIONS AND WARRANTIES, EXPRESS OR IMPLIED, WHETHER BY STATUTE, CUSTOM, USAGE OR OTHERWISE AS TO ANY MATTERS RELATED TO THE FLUX.1 [dev] MODEL AND FLUX CONTENT FILTERS, INCLUDING BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, TITLE, SATISFACTORY QUALITY, OR NON-INFRINGEMENT. COMPANY MAKES NO WARRANTIES OR REPRESENTATIONS THAT THE FLUX.1 [dev] MODEL AND FLUX CONTENT FILTERS WILL BE ERROR FREE OR FREE OF VIRUSES OR OTHER HARMFUL COMPONENTS, OR PRODUCE ANY PARTICULAR RESULTS.

6. LIMITATION OF LIABILITY. TO THE FULLEST EXTENT PERMITTED BY LAW, IN NO EVENT WILL COMPANY BE LIABLE TO YOU OR YOUR EMPLOYEES, AFFILIATES, USERS, OFFICERS OR DIRECTORS (A) UNDER ANY THEORY OF LIABILITY, WHETHER BASED IN CONTRACT, TORT, NEGLIGENCE, STRICT LIABILITY, WARRANTY, OR OTHERWISE UNDER THIS LICENSE, OR (B) FOR ANY INDIRECT, CONSEQUENTIAL, EXEMPLARY, INCIDENTAL, PUNITIVE OR SPECIAL DAMAGES OR LOST PROFITS, EVEN IF COMPANY HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGES. THE FLUX.1 [dev] MODEL, ITS CONSTITUENT COMPONENTS, FLUX CONTENT FILTERS, AND ANY OUTPUT (COLLECTIVELY, “MODEL MATERIALS”) ARE NOT DESIGNED OR INTENDED FOR USE IN ANY APPLICATION OR SITUATION WHERE FAILURE OR FAULT OF THE MODEL MATERIALS COULD REASONABLY BE ANTICIPATED TO LEAD TO SERIOUS INJURY OF ANY PERSON, INCLUDING POTENTIAL DISCRIMINATION OR VIOLATION OF AN INDIVIDUAL’S PRIVACY RIGHTS, OR TO SEVERE PHYSICAL, PROPERTY, OR ENVIRONMENTAL DAMAGE (EACH, A “HIGH-RISK USE”). IF YOU ELECT TO USE ANY OF THE MODEL MATERIALS FOR A HIGH-RISK USE, YOU DO SO AT YOUR OWN RISK. YOU AGREE TO DESIGN AND IMPLEMENT APPROPRIATE DECISION-MAKING AND RISK-MITIGATION PROCEDURES AND POLICIES IN CONNECTION WITH A HIGH-RISK USE SUCH THAT EVEN IF THERE IS A FAILURE OR FAULT IN ANY OF THE MODEL MATERIALS, THE SAFETY OF PERSONS OR PROPERTY AFFECTED BY THE ACTIVITY STAYS AT A LEVEL THAT IS REASONABLE, APPROPRIATE, AND LAWFUL FOR THE FIELD OF THE HIGH-RISK USE.

7. INDEMNIFICATION. You will indemnify, defend and hold harmless Company and our subsidiaries and affiliates, and each of our respective shareholders, directors, officers, employees, agents, successors, and assigns (collectively, the “Company Parties”) from and against any losses, liabilities, damages, fines, penalties, and expenses (including reasonable attorneys’ fees) incurred by any Company Party in connection with any claim, demand, allegation, lawsuit, proceeding, or investigation (collectively, “Claims”) arising out of or related to (a) your access to or use of the FLUX.1 [dev] Model (including in connection with any Output, results or data generated from such access or use, or from your access or use of any FLUX Content Filters), including any High-Risk Use; (b) your Content Filters, including your failure to implement any Content Filters where required by this License such as in Section 2(e); (c) your violation of this License; or (d) your violation, misappropriation or infringement of any rights of another (including intellectual property or other proprietary rights and privacy rights). You will promptly notify the Company Parties of any such Claims, and cooperate with Company Parties in defending such Claims. You will also grant the Company Parties sole control of the defense or settlement, at Company’s sole option, of any Claims. This indemnity is in addition to, and not in lieu of, any other indemnities or remedies set forth in a written agreement between you and Company or the other Company Parties.

8. Termination; Survival.
- a. This License will automatically terminate upon any breach by you of the terms of this License.
- b. We may terminate this License, in whole or in part, at any time upon notice (including electronic) to you.
- c. If you initiate any legal action or proceedings against Company or any other entity (including a cross-claim or counterclaim in a lawsuit), alleging that the FLUX.1 [dev] Model, any Derivative, or FLUX Content Filters, or any part thereof, infringe upon intellectual property or other rights owned or licensable by you, then any licenses granted to you under this License will immediately terminate as of the date such legal action or claim is filed or initiated.
- d. Upon termination of this License, you must cease all use, access or Distribution of the FLUX.1 [dev] Model, any Derivatives, and any FLUX Content Filters. The following sections survive termination of this License 2(c), 2(d), 4-11.

9. Third Party Materials. The FLUX.1 [dev] Model may contain third-party software or other components (including free and open source software) (all of the foregoing, “Third Party Materials”), which are subject to the license terms of the respective third-party licensors. Your dealings or correspondence with third parties and your use of or interaction with any Third Party Materials are solely between you and the third party. Company does not control or endorse, and makes no representations or warranties regarding, any Third Party Materials, and your access to and use of such Third Party Materials are at your own risk.

10. Trademarks. You have not been granted any trademark license as part of this License and may not use any name, logo or trademark associated with Company without the prior written permission of Company, except to the extent necessary to make the reference required in the Attribution Notice as specified above or as is reasonably necessary in describing the FLUX.1 [dev] Model and its creators.

11. General. This License will be governed and construed under the laws of the State of Delaware without regard to conflicts of law provisions. If any provision or part of a provision of this License is unlawful, void or unenforceable, that provision or part of the provision is deemed severed from this License, and will not affect the validity and enforceability of any remaining provisions. The failure of Company to exercise or enforce any right or provision of this License will not operate as a waiver of such right or provision. This License does not confer any third-party beneficiary rights upon any other person or entity. This License, together with the documentation, contains the entire understanding between you and Company regarding the subject matter of this License, and supersedes all other written or oral agreements and understandings between you and Company regarding such subject matter.
app.py
ADDED
@@ -0,0 +1,586 @@
| 1 |
+
# Copyright (c) 2025 Fudan University. All rights reserved.
|
| 2 |
+
|
| 3 |
+
|
| 4 |
+
|
| 5 |
+
import dataclasses
|
| 6 |
+
import json
|
| 7 |
+
import os
|
| 8 |
+
from pathlib import Path
|
| 9 |
+
from typing import List, Literal, Optional
|
| 10 |
+
|
| 11 |
+
import cv2
|
| 12 |
+
import gradio as gr
|
| 13 |
+
import numpy as np
|
| 14 |
+
import torch
|
| 15 |
+
from PIL import Image, ImageDraw
|
| 16 |
+
|
| 17 |
+
from withanyone.flux.pipeline import WithAnyonePipeline
|
| 18 |
+
from util import extract_moref, face_preserving_resize
|
| 19 |
+
import insightface
|
| 20 |
+
|
| 21 |
+
|
| 22 |
+
def captioner(prompt: str, num_person = 1) -> List[List[float]]:
|
| 23 |
+
# use random choose for testing
|
| 24 |
+
# within 512
|
| 25 |
+
if num_person == 1:
|
| 26 |
+
bbox_choices = [
|
| 27 |
+
# expanded, centered and quadrant placements
|
| 28 |
+
[96, 96, 288, 288],
|
| 29 |
+
[128, 128, 320, 320],
|
| 30 |
+
[160, 96, 352, 288],
|
| 31 |
+
[96, 160, 288, 352],
|
| 32 |
+
[208, 96, 400, 288],
|
| 33 |
+
[96, 208, 288, 400],
|
| 34 |
+
[192, 160, 368, 336],
|
| 35 |
+
[64, 128, 224, 320],
|
| 36 |
+
[288, 128, 448, 320],
|
| 37 |
+
[128, 256, 320, 448],
|
| 38 |
+
[80, 80, 240, 272],
|
| 39 |
+
[196, 196, 380, 380],
|
| 40 |
+
# originals
|
| 41 |
+
[100, 100, 300, 300],
|
| 42 |
+
[150, 50, 450, 350],
|
| 43 |
+
[200, 100, 500, 400],
|
| 44 |
+
[250, 150, 512, 450],
|
| 45 |
+
]
|
| 46 |
+
return [bbox_choices[np.random.randint(0, len(bbox_choices))]]
|
| 47 |
+
elif num_person == 2:
|
| 48 |
+
# realistic side-by-side rows (no vertical stacks or diagonals)
|
| 49 |
+
bbox_choices = [
|
| 50 |
+
[[64, 112, 224, 304], [288, 112, 448, 304]],
|
| 51 |
+
[[48, 128, 208, 320], [304, 128, 464, 320]],
|
| 52 |
+
[[32, 144, 192, 336], [320, 144, 480, 336]],
|
| 53 |
+
[[80, 96, 240, 288], [272, 96, 432, 288]],
|
| 54 |
+
[[80, 160, 240, 352], [272, 160, 432, 352]],
|
| 55 |
+
[[64, 128, 240, 336], [272, 144, 432, 320]], # slight stagger, same row
|
| 56 |
+
[[96, 160, 256, 352], [288, 160, 448, 352]],
|
| 57 |
+
[[64, 192, 224, 384], [288, 192, 448, 384]], # lower row
|
| 58 |
+
[[16, 128, 176, 320], [336, 128, 496, 320]], # near edges
|
| 59 |
+
[[48, 120, 232, 328], [280, 120, 464, 328]],
|
| 60 |
+
[[96, 160, 240, 336], [272, 160, 416, 336]], # tighter faces
|
| 61 |
+
[[72, 136, 232, 328], [280, 152, 440, 344]], # small vertical offset
|
| 62 |
+
[[48, 120, 224, 344], [288, 144, 448, 336]], # asymmetric sizes
|
| 63 |
+
[[80, 224, 240, 416], [272, 224, 432, 416]], # bottom row
|
| 64 |
+
[[80, 64, 240, 256], [272, 64, 432, 256]], # top row
|
| 65 |
+
[[96, 176, 256, 368], [288, 176, 448, 368]],
|
| 66 |
+
]
|
| 67 |
+
return bbox_choices[np.random.randint(0, len(bbox_choices))]
|
| 68 |
+
|
| 69 |
+
elif num_person == 3:
|
| 70 |
+
# Non-overlapping 3-person layouts within 512x512
|
| 71 |
+
bbox_choices = [
|
| 72 |
+
[[20, 140, 150, 360], [180, 120, 330, 360], [360, 130, 500, 360]],
|
| 73 |
+
[[30, 100, 160, 300], [190, 90, 320, 290], [350, 110, 480, 310]],
|
| 74 |
+
[[40, 180, 150, 330], [200, 180, 310, 330], [360, 180, 470, 330]],
|
| 75 |
+
[[60, 120, 170, 300], [210, 110, 320, 290], [350, 140, 480, 320]],
|
| 76 |
+
[[50, 80, 170, 250], [200, 130, 320, 300], [350, 80, 480, 250]],
|
| 77 |
+
[[40, 260, 170, 480], [190, 60, 320, 240], [350, 260, 490, 480]],
|
| 78 |
+
[[30, 120, 150, 320], [200, 140, 320, 340], [360, 160, 500, 360]],
|
| 79 |
+
[[80, 140, 200, 300], [220, 80, 350, 260], [370, 160, 500, 320]],
|
| 80 |
+
]
|
| 81 |
+
return bbox_choices[np.random.randint(0, len(bbox_choices))]
|
| 82 |
+
elif num_person == 4:
|
| 83 |
+
# Non-overlapping 4-person layouts within 512x512
|
| 84 |
+
bbox_choices = [
|
| 85 |
+
[[20, 100, 120, 240], [140, 100, 240, 240], [260, 100, 360, 240], [380, 100, 480, 240]],
|
| 86 |
+
[[40, 60, 200, 260], [220, 60, 380, 260], [40, 280, 200, 480], [220, 280, 380, 480]],
|
| 87 |
+
[[180, 30, 330, 170], [30, 220, 150, 380], [200, 220, 320, 380], [360, 220, 490, 380]],
|
| 88 |
+
[[30, 60, 140, 200], [370, 60, 480, 200], [30, 320, 140, 460], [370, 320, 480, 460]],
|
| 89 |
+
[[20, 120, 120, 380], [140, 100, 240, 360], [260, 120, 360, 380], [380, 100, 480, 360]],
|
| 90 |
+
[[30, 80, 150, 240], [180, 120, 300, 280], [330, 80, 450, 240], [200, 300, 320, 460]],
|
| 91 |
+
[[30, 140, 110, 330], [140, 140, 220, 330], [250, 140, 330, 330], [370, 140, 450, 330]],
|
| 92 |
+
[[40, 80, 150, 240], [40, 260, 150, 420], [200, 80, 310, 240], [370, 80, 480, 240]],
|
| 93 |
+
]
|
| 94 |
+
return bbox_choices[np.random.randint(0, len(bbox_choices))]
|
| 95 |
+
|
| 96 |
+
|
| 97 |
+
|
| 98 |
+
|
| 99 |
+
class FaceExtractor:
|
| 100 |
+
def __init__(self, model_path="./"):
|
| 101 |
+
try:
|
| 102 |
+
self.model = insightface.app.FaceAnalysis(name = "antelopev2", root=model_path, providers=['CUDAExecutionProvider'])
|
| 103 |
+
except Exception as e:
|
| 104 |
+
print(f"Error loading insightface model: {e}. There might be an issue with the directory structure. Trying to fix it...")
|
| 105 |
+
antelopev2_nested_path = os.path.join(model_path, "models", "antelopev2", "antelopev2")
|
| 106 |
+
print(f"Checking for nested path: {antelopev2_nested_path}")
|
| 107 |
+
if os.path.exists(antelopev2_nested_path):
|
| 108 |
+
import subprocess
|
| 109 |
+
print("Detected nested antelopev2 directory, fixing directory structure...")
|
| 110 |
+
# Change to the model_path directory to execute commands
|
| 111 |
+
current_dir = os.getcwd()
|
| 112 |
+
os.chdir(model_path)
|
| 113 |
+
# Execute the commands as specified by the user
|
| 114 |
+
subprocess.run(["mv", "models/antelopev2/", "models/antelopev2_"])
|
| 115 |
+
subprocess.run(["mv", "models/antelopev2_/antelopev2/", "models/antelopev2/"])
|
| 116 |
+
# Return to the original directory
|
| 117 |
+
os.chdir(current_dir)
|
| 118 |
+
print("Directory structure fixed.")
|
| 119 |
+
self.model = insightface.app.FaceAnalysis(name="antelopev2", root="./")
|
| 120 |
+
self.model.prepare(ctx_id=0)
|
| 121 |
+
|
| 122 |
+
def extract(self, image: Image.Image):
|
| 123 |
+
"""Extract single face and embedding from an image"""
|
| 124 |
+
image_np = cv2.cvtColor(np.array(image), cv2.COLOR_RGB2BGR)
|
| 125 |
+
res = self.model.get(image_np)
|
| 126 |
+
if len(res) == 0:
|
| 127 |
+
return None, None
|
| 128 |
+
res = res[0]
|
| 129 |
+
bbox = res["bbox"]
|
| 130 |
+
moref = extract_moref(image, {"bboxes": [bbox]}, 1)
|
| 131 |
+
return moref[0], res["embedding"]
|
| 132 |
+
|
| 133 |
+
def extract_refs(self, image: Image.Image):
|
| 134 |
+
"""Extract multiple faces and embeddings from an image"""
|
| 135 |
+
image_np = cv2.cvtColor(np.array(image), cv2.COLOR_RGB2BGR)
|
| 136 |
+
res = self.model.get(image_np)
|
| 137 |
+
if len(res) == 0:
|
| 138 |
+
return None, None, None
|
| 139 |
+
ref_imgs = []
|
| 140 |
+
arcface_embeddings = []
|
| 141 |
+
bboxes = []
|
| 142 |
+
for r in res:
|
| 143 |
+
bbox = r["bbox"]
|
| 144 |
+
bboxes.append(bbox)
|
| 145 |
+
moref = extract_moref(image, {"bboxes": [bbox]}, 1)
|
| 146 |
+
ref_imgs.append(moref[0])
|
| 147 |
+
arcface_embeddings.append(r["embedding"])
|
| 148 |
+
|
| 149 |
+
# Convert bboxes to the correct format
|
| 150 |
+
new_img, new_bboxes = face_preserving_resize(image, bboxes, 512)
|
| 151 |
+
return ref_imgs, arcface_embeddings, new_bboxes, new_img
|
| 152 |
+
|
| 153 |
+
|
| 154 |
+
def resize_bbox(bbox, ori_width, ori_height, new_width, new_height):
|
| 155 |
+
"""Resize bounding box coordinates while preserving aspect ratio"""
|
| 156 |
+
x1, y1, x2, y2 = bbox
|
| 157 |
+
|
| 158 |
+
# Calculate scaling factors
|
| 159 |
+
width_scale = new_width / ori_width
|
| 160 |
+
height_scale = new_height / ori_height
|
| 161 |
+
|
| 162 |
+
# Use minimum scaling factor to preserve aspect ratio
|
| 163 |
+
min_scale = min(width_scale, height_scale)
|
| 164 |
+
|
| 165 |
+
# Calculate offsets for centering the scaled box
|
| 166 |
+
width_offset = (new_width - ori_width * min_scale) / 2
|
| 167 |
+
height_offset = (new_height - ori_height * min_scale) / 2
|
| 168 |
+
|
| 169 |
+
# Scale and adjust coordinates
|
| 170 |
+
new_x1 = int(x1 * min_scale + width_offset)
|
| 171 |
+
new_y1 = int(y1 * min_scale + height_offset)
|
| 172 |
+
new_x2 = int(x2 * min_scale + width_offset)
|
| 173 |
+
new_y2 = int(y2 * min_scale + height_offset)
|
| 174 |
+
|
| 175 |
+
return [new_x1, new_y1, new_x2, new_y2]
|
| 176 |
+
|
| 177 |
+
|
| 178 |
+
def draw_bboxes_on_image(image, bboxes):
|
| 179 |
+
"""Draw bounding boxes on image for visualization"""
|
| 180 |
+
if bboxes is None:
|
| 181 |
+
return image
|
| 182 |
+
|
| 183 |
+
# Create a copy to draw on
|
| 184 |
+
img_draw = image.copy()
|
| 185 |
+
draw = ImageDraw.Draw(img_draw)
|
| 186 |
+
|
| 187 |
+
# Draw each bbox with a different color
|
| 188 |
+
colors = [(255, 0, 0), (0, 255, 0), (0, 0, 255), (255, 255, 0)]
|
| 189 |
+
|
| 190 |
+
for i, bbox in enumerate(bboxes):
|
| 191 |
+
color = colors[i % len(colors)]
|
| 192 |
+
x1, y1, x2, y2 = [int(coord) for coord in bbox]
|
| 193 |
+
# Draw rectangle
|
| 194 |
+
draw.rectangle([x1, y1, x2, y2], outline=color, width=3)
|
| 195 |
+
# Draw label
|
| 196 |
+
draw.text((x1, y1-15), f"Face {i+1}", fill=color)
|
| 197 |
+
|
| 198 |
+
return img_draw
|
| 199 |
+
|
| 200 |
+
|
| 201 |
+
def create_demo(
|
| 202 |
+
model_type: str = "flux-dev",
|
| 203 |
+
ipa_path: str = "./ckpt/ipa.safetensors",
|
| 204 |
+
device: str = "cuda" if torch.cuda.is_available() else "cpu",
|
| 205 |
+
offload: bool = False,
|
| 206 |
+
lora_rank: int = 64,
|
| 207 |
+
additional_lora_ckpt: Optional[str] = None,
|
| 208 |
+
lora_scale: float = 1.0,
|
| 209 |
+
clip_path: str = "openai/clip-vit-large-patch14",
|
| 210 |
+
t5_path: str = "xlabs-ai/xflux_text_encoders",
|
| 211 |
+
flux_path: str = "black-forest-labs/FLUX.1-dev",
|
| 212 |
+
):
|
| 213 |
+
|
| 214 |
+
face_extractor = FaceExtractor()
|
| 215 |
+
# Initialize pipeline and face extractor
|
| 216 |
+
pipeline = WithAnyonePipeline(
|
| 217 |
+
model_type,
|
| 218 |
+
ipa_path,
|
| 219 |
+
device,
|
| 220 |
+
offload,
|
| 221 |
+
only_lora=True,
|
| 222 |
+
no_lora=True,
|
| 223 |
+
lora_rank=lora_rank,
|
| 224 |
+
additional_lora_ckpt=additional_lora_ckpt,
|
| 225 |
+
lora_weight=lora_scale,
|
| 226 |
+
face_extractor=face_extractor,
|
| 227 |
+
clip_path=clip_path,
|
| 228 |
+
t5_path=t5_path,
|
| 229 |
+
flux_path=flux_path,
|
| 230 |
+
)
|
| 231 |
+
|
| 232 |
+
|
| 233 |
+
|
| 234 |
+
# Add project badges
|
| 235 |
+
# badges_text = r"""
|
| 236 |
+
# <div style="text-align: center; display: flex; justify-content: left; gap: 5px;">
|
| 237 |
+
# <a href="https://github.com/bytedance/UNO"><img alt="Build" src="https://img.shields.io/github/stars/bytedance/UNO"></a>
|
| 238 |
+
# <a href="https://bytedance.github.io/UNO/"><img alt="Build" src="https://img.shields.io/badge/Project%20Page-UNO-yellow"></a>
|
| 239 |
+
# <a href="https://arxiv.org/abs/2504.02160"><img alt="Build" src="https://img.shields.io/badge/arXiv%20paper-UNO-b31b1b.svg"></a>
|
| 240 |
+
# </div>
|
| 241 |
+
# """.strip()
|
| 242 |
+
|
| 243 |
+
def parse_bboxes(bbox_text):
|
| 244 |
+
"""Parse bounding box text input"""
|
| 245 |
+
if not bbox_text or bbox_text.strip() == "":
|
| 246 |
+
return None
|
| 247 |
+
|
| 248 |
+
try:
|
| 249 |
+
bboxes = []
|
| 250 |
+
lines = bbox_text.strip().split("\n")
|
| 251 |
+
for line in lines:
|
| 252 |
+
if not line.strip():
|
| 253 |
+
continue
|
| 254 |
+
coords = [float(x) for x in line.strip().split(",")]
|
| 255 |
+
if len(coords) != 4:
|
| 256 |
+
raise ValueError(f"Each bbox must have 4 coordinates (x1,y1,x2,y2), got: {line}")
|
| 257 |
+
bboxes.append(coords)
|
| 258 |
+
# print(f"\nParsed bboxes: {bboxes}\n")
|
| 259 |
+
return bboxes
|
| 260 |
+
except Exception as e:
|
| 261 |
+
raise gr.Error(f"Invalid bbox format: {e}")
|
| 262 |
+
|
| 263 |
+
def extract_from_multi_person(multi_person_image):
|
| 264 |
+
"""Extract references and bboxes from a multi-person image"""
|
| 265 |
+
if multi_person_image is None:
|
| 266 |
+
return None, None, None, None
|
| 267 |
+
|
| 268 |
+
# Convert from numpy to PIL if needed
|
| 269 |
+
if isinstance(multi_person_image, np.ndarray):
|
| 270 |
+
multi_person_image = Image.fromarray(multi_person_image)
|
| 271 |
+
|
| 272 |
+
ref_imgs, arcface_embeddings, bboxes, new_img = face_extractor.extract_refs(multi_person_image)
|
| 273 |
+
|
| 274 |
+
if ref_imgs is None or len(ref_imgs) == 0:
|
| 275 |
+
raise gr.Error("No faces detected in the multi-person image")
|
| 276 |
+
|
| 277 |
+
# Limit to max 4 faces
|
| 278 |
+
ref_imgs = ref_imgs[:4]
|
| 279 |
+
arcface_embeddings = arcface_embeddings[:4]
|
| 280 |
+
bboxes = bboxes[:4]
|
| 281 |
+
|
| 282 |
+
# Create visualization with bboxes
|
| 283 |
+
viz_image = draw_bboxes_on_image(new_img, bboxes)
|
| 284 |
+
|
| 285 |
+
# Format bboxes as string for display
|
| 286 |
+
bbox_text = "\n".join([f"{bbox[0]:.1f},{bbox[1]:.1f},{bbox[2]:.1f},{bbox[3]:.1f}" for bbox in bboxes])
|
| 287 |
+
|
| 288 |
+
return ref_imgs, arcface_embeddings, bboxes, viz_image
|
| 289 |
+
|
| 290 |
+
def process_and_generate(
|
| 291 |
+
prompt,
|
| 292 |
+
width, height,
|
| 293 |
+
guidance, num_steps, seed,
|
| 294 |
+
ref_img1, ref_img2, ref_img3, ref_img4,
|
| 295 |
+
manual_bboxes_text,
|
| 296 |
+
multi_person_image,
|
| 297 |
+
# use_text_prompt,
|
| 298 |
+
# id_weight,
|
| 299 |
+
siglip_weight
|
| 300 |
+
):
|
| 301 |
+
# Collect and validate reference images
|
| 302 |
+
ref_images = [img for img in [ref_img1, ref_img2, ref_img3, ref_img4] if img is not None]
|
| 303 |
+
|
| 304 |
+
if not ref_images:
|
| 305 |
+
raise gr.Error("At least one reference image is required")
|
| 306 |
+
|
| 307 |
+
# Process reference images to extract face and embeddings
|
| 308 |
+
ref_imgs = []
|
| 309 |
+
arcface_embeddings = []
|
| 310 |
+
|
| 311 |
+
# Modified bbox handling logic
|
| 312 |
+
if multi_person_image is not None:
|
| 313 |
+
# Extract from multi-person image mode
|
| 314 |
+
extracted_refs, extracted_embeddings, bboxes_, _ = extract_from_multi_person(multi_person_image)
|
| 315 |
+
if extracted_refs is None:
|
| 316 |
+
raise gr.Error("Failed to extract faces from the multi-person image")
|
| 317 |
+
|
| 318 |
+
print("bboxes from multi-person image:", bboxes_)
|
| 319 |
+
# need to resize bboxes from 512 512 to width height
|
| 320 |
+
bboxes_ = [resize_bbox(bbox, 512, 512, width, height) for bbox in bboxes_]
|
| 321 |
+
|
| 322 |
+
else:
|
| 323 |
+
# Parse manual bboxes
|
| 324 |
+
bboxes_ = parse_bboxes(manual_bboxes_text)
|
| 325 |
+
|
| 326 |
+
# If no manual bboxes provided, use automatic captioner
|
| 327 |
+
if bboxes_ is None:
|
| 328 |
+
print("No multi-person image or manual bboxes provided. Using automatic captioner.")
|
| 329 |
+
# Generate automatic bboxes based on image dimensions
|
| 330 |
+
bboxes__ = captioner(prompt, num_person=len(ref_images))
|
| 331 |
+
# resize to width height
|
| 332 |
+
bboxes_ = [resize_bbox(bbox, 512, 512, width, height) for bbox in bboxes__]
|
| 333 |
+
print("Automatically generated bboxes:", bboxes_)
|
| 334 |
+
|
| 335 |
+
bboxes = [bboxes_] # 伪装batch输入
|
| 336 |
+
# else:
|
| 337 |
+
# Manual mode: process each reference image
|
| 338 |
+
for img in ref_images:
|
| 339 |
+
if isinstance(img, np.ndarray):
|
| 340 |
+
img = Image.fromarray(img)
|
| 341 |
+
|
| 342 |
+
ref_img, embedding = face_extractor.extract(img)
|
| 343 |
+
if ref_img is None or embedding is None:
|
| 344 |
+
raise gr.Error("Failed to extract face from one of the reference images")
|
| 345 |
+
|
| 346 |
+
ref_imgs.append(ref_img)
|
| 347 |
+
arcface_embeddings.append(embedding)
|
| 348 |
+
|
| 349 |
+
# pad arcface_embeddings to 4 if less than 4
|
| 350 |
+
# while len(arcface_embeddings) < 4:
|
| 351 |
+
# arcface_embeddings.append(np.zeros_like(arcface_embeddings[0]))
|
| 352 |
+
|
| 353 |
+
|
| 354 |
+
if bboxes is None:
|
| 355 |
+
raise gr.Error("Either provide manual bboxes or a multi-person image for bbox extraction")
|
| 356 |
+
|
| 357 |
+
if len(bboxes[0]) != len(ref_imgs):
|
| 358 |
+
raise gr.Error(f"Number of bboxes ({len(bboxes[0])}) must match number of reference images ({len(ref_imgs)})")
|
| 359 |
+
|
| 360 |
+
# Convert arcface embeddings to tensor
|
| 361 |
+
arcface_embeddings = [torch.tensor(embedding) for embedding in arcface_embeddings]
|
| 362 |
+
arcface_embeddings = torch.stack(arcface_embeddings).to(device)
|
| 363 |
+
|
| 364 |
+
# Generate image
|
| 365 |
+
final_prompt = prompt
|
| 366 |
+
|
| 367 |
+
print(f"Generating image of size {width}x{height} with bboxes: {bboxes} ")
|
| 368 |
+
|
| 369 |
+
if seed < 0:
|
| 370 |
+
seed = np.random.randint(0, 1000000)
|
| 371 |
+
|
| 372 |
+
image_gen = pipeline(
|
| 373 |
+
prompt=final_prompt,
|
| 374 |
+
width=width,
|
| 375 |
+
height=height,
|
| 376 |
+
guidance=guidance,
|
| 377 |
+
num_steps=num_steps,
|
| 378 |
+
seed=seed if seed > 0 else None,
|
| 379 |
+
ref_imgs=ref_imgs,
|
| 380 |
+
arcface_embeddings=arcface_embeddings,
|
| 381 |
+
bboxes=bboxes,
|
| 382 |
+
id_weight = 1 - siglip_weight,
|
| 383 |
+
siglip_weight=siglip_weight,
|
| 384 |
+
)
|
| 385 |
+
|
| 386 |
+
# Save temp file for download
|
| 387 |
+
temp_path = "temp_generated.png"
|
| 388 |
+
image_gen.save(temp_path)
|
| 389 |
+
|
| 390 |
+
# draw bboxes on the generated image for debug
|
| 391 |
+
debug_face = draw_bboxes_on_image(image_gen, bboxes[0])
|
| 392 |
+
|
| 393 |
+
return image_gen, debug_face, temp_path
|
| 394 |
+
|
| 395 |
+
def update_bbox_display(multi_person_image):
|
| 396 |
+
if multi_person_image is None:
|
| 397 |
+
return None, gr.update(visible=True), gr.update(visible=False)
|
| 398 |
+
|
| 399 |
+
try:
|
| 400 |
+
_, _, _, viz_image = extract_from_multi_person(multi_person_image)
|
| 401 |
+
return viz_image, gr.update(visible=False), gr.update(visible=True)
|
| 402 |
+
except Exception as e:
|
| 403 |
+
return None, gr.update(visible=True), gr.update(visible=False)
|
| 404 |
+
|
| 405 |
+
# Create Gradio interface
|
| 406 |
+
with gr.Blocks() as demo:
|
| 407 |
+
gr.Markdown("# WithAnyone Demo")
|
| 408 |
+
# gr.Markdown(badges_text)
|
| 409 |
+
|
| 410 |
+
with gr.Row():
|
| 411 |
+
|
| 412 |
+
with gr.Column():
|
| 413 |
+
# Input controls
|
| 414 |
+
generate_btn = gr.Button("Generate", variant="primary")
|
| 415 |
+
with gr.Row():
|
| 416 |
+
with gr.Column():
|
| 417 |
+
siglip_weight = gr.Slider(0.0, 1.0, 1.0, step=0.05, label="Spiritual Resemblance <--> Formal Resemblance")
|
| 418 |
+
with gr.Row():
|
| 419 |
+
prompt = gr.Textbox(label="Prompt", value="a person in a beautiful garden. High resolution, extremely detailed")
|
| 420 |
+
# use_text_prompt = gr.Checkbox(label="Use text prompt", value=True)
|
| 421 |
+
|
| 422 |
+
|
| 423 |
+
with gr.Row():
|
| 424 |
+
# Image generation settings
|
| 425 |
+
with gr.Column():
|
| 426 |
+
width = gr.Slider(512, 1024, 768, step=64, label="Generation Width")
|
| 427 |
+
height = gr.Slider(512, 1024, 768, step=64, label="Generation Height")
|
| 428 |
+
|
| 429 |
+
with gr.Accordion("Advanced Options", open=False):
|
| 430 |
+
with gr.Row():
|
| 431 |
+
num_steps = gr.Slider(1, 50, 25, step=1, label="Number of steps")
|
| 432 |
+
guidance = gr.Slider(1.0, 10.0, 4.0, step=0.1, label="Guidance")
|
| 433 |
+
seed = gr.Number(-1, label="Seed (-1 for random)")
|
| 434 |
+
|
| 435 |
+
# start_at = gr.Slider(0, 50, 0, step=1, label="Start Identity at Step")
|
| 436 |
+
# end_at = gr.Number(-1, label="End Identity at Step (-1 for last)")
|
| 437 |
+
|
| 438 |
+
# with gr.Row():
|
| 439 |
+
# # skip_every = gr.Number(-1, label="Skip Identity Every N Steps (-1 for no skip)")
|
| 440 |
+
|
| 441 |
+
# siglip_weight = gr.Slider(0.0, 1.0, 1.0, step=0.05, label="Siglip Weight")
|
| 442 |
+
|
| 443 |
+
|
| 444 |
+
with gr.Row():
|
| 445 |
+
with gr.Column():
|
| 446 |
+
# Reference image inputs
|
| 447 |
+
gr.Markdown("### Face References (1-4 required)")
|
| 448 |
+
ref_img1 = gr.Image(label="Reference 1", type="pil")
|
| 449 |
+
ref_img2 = gr.Image(label="Reference 2", type="pil", visible=True)
|
| 450 |
+
ref_img3 = gr.Image(label="Reference 3", type="pil", visible=True)
|
| 451 |
+
ref_img4 = gr.Image(label="Reference 4", type="pil", visible=True)
|
| 452 |
+
|
| 453 |
+
with gr.Column():
|
| 454 |
+
# Bounding box inputs
|
| 455 |
+
gr.Markdown("### Mask Configuration (Option 1: Automatic)")
|
| 456 |
+
multi_person_image = gr.Image(label="Multi-person image (for automatic bbox extraction)", type="pil")
|
| 457 |
+
bbox_preview = gr.Image(label="Detected Faces", type="pil")
|
| 458 |
+
|
| 459 |
+
|
| 460 |
+
gr.Markdown("### Mask Configuration (Option 2: Manual)")
|
| 461 |
+
manual_bbox_input = gr.Textbox(
|
| 462 |
+
label="Manual Bounding Boxes (one per line, format: x1,y1,x2,y2)",
|
| 463 |
+
lines=4,
|
| 464 |
+
placeholder="100,100,200,200\n300,100,400,200"
|
| 465 |
+
)
|
| 466 |
+
|
| 467 |
+
|
| 468 |
+
|
| 469 |
+
|
| 470 |
+
|
| 471 |
+
# generate_btn = gr.Button("Generate", variant="primary")
|
| 472 |
+
|
| 473 |
+
with gr.Column():
|
| 474 |
+
# Output display
|
| 475 |
+
output_image = gr.Image(label="Generated Image")
|
| 476 |
+
debug_face = gr.Image(label="Debug. Faces are expected to be generated in these boxes")
|
| 477 |
+
download_btn = gr.File(label="Download full-resolution", type="filepath", interactive=False)
|
| 478 |
+
|
| 479 |
+
# Examples section
|
| 480 |
+
with gr.Row():
|
| 481 |
+
|
| 482 |
+
gr.Markdown("""
|
| 483 |
+
# Example Configurations
|
| 484 |
+
|
| 485 |
+
### Tips for Better Results
|
| 486 |
+
Be prepared for the first few runs as it may not be very satisfying.
|
| 487 |
+
|
| 488 |
+
- Provide detailed prompts describing the identity. WithAnyone is "controllable", so it needs more information to be controlled. Here are something that might go wrong if not specified:
|
| 489 |
+
- Skin color (generally the race is fine, but for asain descent, if not specified, it may generate darker skin tone);
|
| 490 |
+
- Age (e.g., intead of "a man", try "a young man". If not specified, it may generate an older figure);
|
| 491 |
+
- Body build;
|
| 492 |
+
- Hairstyle;
|
| 493 |
+
- Accessories (glasses, hats, earrings, etc.);
|
| 494 |
+
- Makeup
|
| 495 |
+
- Use the slider to balance between "Resemblance in Spirit" and "Resemblance in Form" according to your needs. If you want to preserve more details in the reference image, move the slider to the right; if you want more freedom and creativity, move it to the left.
|
| 496 |
+
- Try it with LoRAs from community. They are usually fantastic.
|
| 497 |
+
""")
|
| 498 |
+
with gr.Row():
|
| 499 |
+
examples = gr.Examples(
|
| 500 |
+
examples=[
|
| 501 |
+
[
|
| 502 |
+
"a highly detailed portrait of a woman shown in profile. Her long, dark hair flows elegantly, intricately decorated with an abundant array of colorful flowers—ranging from soft light pinks and vibrant light oranges to delicate greyish blues—and lush green leaves, giving a sense of natural beauty and charm. Her bright blue eyes are striking, and her lips are painted a vivid red, adding to her alluring appearance. She is clad in an ornate garment with intricate floral patterns in warm hues like pink and orange, featuring exquisite detailing that speaks of fine craftsmanship. Around her neck, she wears a decorative choker with intricate designs, and dangling from her ears are beautiful blue teardrop earrings that catch the light. The background is filled with a profusion of flowers in various shades, creating a rich, vibrant, and romantic atmosphere that complements the woman's elegant and enchanting look.", # prompt
|
| 503 |
+
1024, 1024, # width, height
|
| 504 |
+
4.0, 25, 42, # guidance, num_steps, seed
|
| 505 |
+
"assets/ref1.jpg", None, None, None, # ref images
|
| 506 |
+
"240,180,540,500", None, # manual_bbox_input, multi_person_image
|
| 507 |
+
# True, # use_text_prompt
|
| 508 |
+
0.0, # siglip_weight
|
| 509 |
+
],
|
| 510 |
+
[
|
| 511 |
+
"High resolution anfd extremely detailed image of two elegant ladies enjoying a serene afternoon in a quaint Parisian café. They both wear fashionable trench coats and stylish berets, exuding an air of sophistication. One lady gently sips on a cappuccino, while her companion reads an intriguing novel with a subtle smile. The café is framed by charming antique furniture and vintage posters adorning the walls. Soft, warm light filters through a window, casting delicate shadows and creating a cozy, inviting atmosphere. Captured from a slightly elevated angle, the composition highlights the warmth of the scene in a gentle watercolor illustrative style. ", # prompt
|
| 512 |
+
1024, 1024, # width, height
|
| 513 |
+
4.0, 25, 42, # guidance, num_steps, seed
|
| 514 |
+
"assets/ref1.jpg", "assets/ref2.jpg", None, None, # ref images
|
| 515 |
+
"248,172,428,498\n554,128,728,464", None, # manual_bbox_input, multi_person_image
|
| 516 |
+
# True, # use_text_prompt
|
| 517 |
+
0.0, # siglip_weight
|
| 518 |
+
]
|
| 519 |
+
],
|
| 520 |
+
inputs=[
|
| 521 |
+
prompt, width, height, guidance, num_steps, seed,
|
| 522 |
+
ref_img1, ref_img2, ref_img3, ref_img4,
|
| 523 |
+
manual_bbox_input, multi_person_image,
|
| 524 |
+
siglip_weight
|
| 525 |
+
],
|
| 526 |
+
label="Click to load example configurations"
|
| 527 |
+
)
|
| 528 |
+
# Set up event handlers
|
| 529 |
+
multi_person_image.change(
|
| 530 |
+
fn=update_bbox_display,
|
| 531 |
+
inputs=[multi_person_image],
|
| 532 |
+
outputs=[bbox_preview, manual_bbox_input, bbox_preview]
|
| 533 |
+
)
|
| 534 |
+
|
| 535 |
+
generate_btn.click(
|
| 536 |
+
fn=process_and_generate,
|
| 537 |
+
inputs=[
|
| 538 |
+
prompt, width, height, guidance, num_steps, seed,
|
| 539 |
+
ref_img1, ref_img2, ref_img3, ref_img4,
|
| 540 |
+
manual_bbox_input, multi_person_image,
|
| 541 |
+
siglip_weight
|
| 542 |
+
],
|
| 543 |
+
outputs=[output_image,debug_face, download_btn]
|
| 544 |
+
)
|
| 545 |
+
|
| 546 |
+
return demo
|
| 547 |
+
|
| 548 |
+
|
| 549 |
+
if __name__ == "__main__":
|
| 550 |
+
from transformers import HfArgumentParser
|
| 551 |
+
|
| 552 |
+
@dataclasses.dataclass
|
| 553 |
+
class AppArgs:
|
| 554 |
+
model_type: Literal["flux-dev", "flux-dev-fp8", "flux-schnell"] = "flux-dev"
|
| 555 |
+
device: Literal["cuda", "mps", "cpu"] = (
|
| 556 |
+
"cuda" if torch.cuda.is_available()
|
| 557 |
+
else "mps" if hasattr(torch.backends, "mps") and torch.backends.mps.is_available()
|
| 558 |
+
else "cpu"
|
| 559 |
+
)
|
| 560 |
+
offload: bool = False
|
| 561 |
+
lora_rank: int = 64
|
| 562 |
+
port: int = 7860
|
| 563 |
+
additional_lora: str = None
|
| 564 |
+
lora_scale: float = 1.0
|
| 565 |
+
ipa_path: str = "WithAnyone/WithAnyone"
|
| 566 |
+
clip_path: str = "openai/clip-vit-large-patch14"
|
| 567 |
+
t5_path: str = "xlabs-ai/xflux_text_encoders"
|
| 568 |
+
flux_path: str = "black-forest-labs/FLUX.1-dev"
|
| 569 |
+
|
| 570 |
+
parser = HfArgumentParser([AppArgs])
|
| 571 |
+
args = parser.parse_args_into_dataclasses()[0]
|
| 572 |
+
|
| 573 |
+
demo = create_demo(
|
| 574 |
+
args.model_type,
|
| 575 |
+
args.ipa_path,
|
| 576 |
+
args.device,
|
| 577 |
+
args.offload,
|
| 578 |
+
args.lora_rank,
|
| 579 |
+
args.additional_lora,
|
| 580 |
+
args.lora_scale,
|
| 581 |
+
args.clip_path,
|
| 582 |
+
args.t5_path,
|
| 583 |
+
args.flux_path,
|
| 584 |
+
)
|
| 585 |
+
demo.launch(server_port=args.port)
|
| 586 |
+
|
gradio_app.py
ADDED
|
@@ -0,0 +1,568 @@
|
| 1 |
+
# Copyright (c) 2025 Fudan University. All rights reserved.
|
| 2 |
+
|
| 3 |
+
|
| 4 |
+
|
| 5 |
+
import dataclasses
|
| 6 |
+
import json
|
| 7 |
+
import os
|
| 8 |
+
from pathlib import Path
|
| 9 |
+
from typing import List, Literal, Optional
|
| 10 |
+
|
| 11 |
+
import cv2
|
| 12 |
+
import gradio as gr
|
| 13 |
+
import numpy as np
|
| 14 |
+
import torch
|
| 15 |
+
from PIL import Image, ImageDraw
|
| 16 |
+
|
| 17 |
+
from withanyone.flux.pipeline import WithAnyonePipeline
|
| 18 |
+
from util import extract_moref, face_preserving_resize
|
| 19 |
+
import insightface
|
| 20 |
+
|
| 21 |
+
|
| 22 |
+
def captioner(prompt: str, num_person = 1) -> List[List[float]]:
|
| 23 |
+
# use random choose for testing
|
| 24 |
+
# within 512
|
| 25 |
+
if num_person == 1:
|
| 26 |
+
bbox_choices = [
|
| 27 |
+
# expanded, centered and quadrant placements
|
| 28 |
+
[96, 96, 288, 288],
|
| 29 |
+
[128, 128, 320, 320],
|
| 30 |
+
[160, 96, 352, 288],
|
| 31 |
+
[96, 160, 288, 352],
|
| 32 |
+
[208, 96, 400, 288],
|
| 33 |
+
[96, 208, 288, 400],
|
| 34 |
+
[192, 160, 368, 336],
|
| 35 |
+
[64, 128, 224, 320],
|
| 36 |
+
[288, 128, 448, 320],
|
| 37 |
+
[128, 256, 320, 448],
|
| 38 |
+
[80, 80, 240, 272],
|
| 39 |
+
[196, 196, 380, 380],
|
| 40 |
+
# originals
|
| 41 |
+
[100, 100, 300, 300],
|
| 42 |
+
[150, 50, 450, 350],
|
| 43 |
+
[200, 100, 500, 400],
|
| 44 |
+
[250, 150, 512, 450],
|
| 45 |
+
]
|
| 46 |
+
return [bbox_choices[np.random.randint(0, len(bbox_choices))]]
|
| 47 |
+
elif num_person == 2:
|
| 48 |
+
# realistic side-by-side rows (no vertical stacks or diagonals)
|
| 49 |
+
bbox_choices = [
|
| 50 |
+
[[64, 112, 224, 304], [288, 112, 448, 304]],
|
| 51 |
+
[[48, 128, 208, 320], [304, 128, 464, 320]],
|
| 52 |
+
[[32, 144, 192, 336], [320, 144, 480, 336]],
|
| 53 |
+
[[80, 96, 240, 288], [272, 96, 432, 288]],
|
| 54 |
+
[[80, 160, 240, 352], [272, 160, 432, 352]],
|
| 55 |
+
[[64, 128, 240, 336], [272, 144, 432, 320]], # slight stagger, same row
|
| 56 |
+
[[96, 160, 256, 352], [288, 160, 448, 352]],
|
| 57 |
+
[[64, 192, 224, 384], [288, 192, 448, 384]], # lower row
|
| 58 |
+
[[16, 128, 176, 320], [336, 128, 496, 320]], # near edges
|
| 59 |
+
[[48, 120, 232, 328], [280, 120, 464, 328]],
|
| 60 |
+
[[96, 160, 240, 336], [272, 160, 416, 336]], # tighter faces
|
| 61 |
+
[[72, 136, 232, 328], [280, 152, 440, 344]], # small vertical offset
|
| 62 |
+
[[48, 120, 224, 344], [288, 144, 448, 336]], # asymmetric sizes
|
| 63 |
+
[[80, 224, 240, 416], [272, 224, 432, 416]], # bottom row
|
| 64 |
+
[[80, 64, 240, 256], [272, 64, 432, 256]], # top row
|
| 65 |
+
[[96, 176, 256, 368], [288, 176, 448, 368]],
|
| 66 |
+
]
|
| 67 |
+
return bbox_choices[np.random.randint(0, len(bbox_choices))]
|
| 68 |
+
|
| 69 |
+
elif num_person == 3:
|
| 70 |
+
# Non-overlapping 3-person layouts within 512x512
|
| 71 |
+
bbox_choices = [
|
| 72 |
+
[[20, 140, 150, 360], [180, 120, 330, 360], [360, 130, 500, 360]],
|
| 73 |
+
[[30, 100, 160, 300], [190, 90, 320, 290], [350, 110, 480, 310]],
|
| 74 |
+
[[40, 180, 150, 330], [200, 180, 310, 330], [360, 180, 470, 330]],
|
| 75 |
+
[[60, 120, 170, 300], [210, 110, 320, 290], [350, 140, 480, 320]],
|
| 76 |
+
[[50, 80, 170, 250], [200, 130, 320, 300], [350, 80, 480, 250]],
|
| 77 |
+
[[40, 260, 170, 480], [190, 60, 320, 240], [350, 260, 490, 480]],
|
| 78 |
+
[[30, 120, 150, 320], [200, 140, 320, 340], [360, 160, 500, 360]],
|
| 79 |
+
[[80, 140, 200, 300], [220, 80, 350, 260], [370, 160, 500, 320]],
|
| 80 |
+
]
|
| 81 |
+
return bbox_choices[np.random.randint(0, len(bbox_choices))]
|
| 82 |
+
elif num_person == 4:
|
| 83 |
+
# Non-overlapping 4-person layouts within 512x512
|
| 84 |
+
bbox_choices = [
|
| 85 |
+
[[20, 100, 120, 240], [140, 100, 240, 240], [260, 100, 360, 240], [380, 100, 480, 240]],
|
| 86 |
+
[[40, 60, 200, 260], [220, 60, 380, 260], [40, 280, 200, 480], [220, 280, 380, 480]],
|
| 87 |
+
[[180, 30, 330, 170], [30, 220, 150, 380], [200, 220, 320, 380], [360, 220, 490, 380]],
|
| 88 |
+
[[30, 60, 140, 200], [370, 60, 480, 200], [30, 320, 140, 460], [370, 320, 480, 460]],
|
| 89 |
+
[[20, 120, 120, 380], [140, 100, 240, 360], [260, 120, 360, 380], [380, 100, 480, 360]],
|
| 90 |
+
[[30, 80, 150, 240], [180, 120, 300, 280], [330, 80, 450, 240], [200, 300, 320, 460]],
|
| 91 |
+
[[30, 140, 110, 330], [140, 140, 220, 330], [250, 140, 330, 330], [370, 140, 450, 330]],
|
| 92 |
+
[[40, 80, 150, 240], [40, 260, 150, 420], [200, 80, 310, 240], [370, 80, 480, 240]],
|
| 93 |
+
]
|
| 94 |
+
return bbox_choices[np.random.randint(0, len(bbox_choices))]
|
| 95 |
+
|
| 96 |
+
|
| 97 |
+
|
| 98 |
+
|
| 99 |
+
class FaceExtractor:
|
| 100 |
+
def __init__(self, model_path="./"):
|
| 101 |
+
self.model = insightface.app.FaceAnalysis(name="antelopev2", root="./")
|
| 102 |
+
self.model.prepare(ctx_id=0)
|
| 103 |
+
|
| 104 |
+
def extract(self, image: Image.Image):
|
| 105 |
+
"""Extract single face and embedding from an image"""
|
| 106 |
+
image_np = cv2.cvtColor(np.array(image), cv2.COLOR_RGB2BGR)
|
| 107 |
+
res = self.model.get(image_np)
|
| 108 |
+
if len(res) == 0:
|
| 109 |
+
return None, None
|
| 110 |
+
res = res[0]
|
| 111 |
+
bbox = res["bbox"]
|
| 112 |
+
moref = extract_moref(image, {"bboxes": [bbox]}, 1)
|
| 113 |
+
return moref[0], res["embedding"]
|
| 114 |
+
|
| 115 |
+
def extract_refs(self, image: Image.Image):
|
| 116 |
+
"""Extract multiple faces and embeddings from an image"""
|
| 117 |
+
image_np = cv2.cvtColor(np.array(image), cv2.COLOR_RGB2BGR)
|
| 118 |
+
res = self.model.get(image_np)
|
| 119 |
+
if len(res) == 0:
|
| 120 |
+
return None, None, None, None
|
| 121 |
+
ref_imgs = []
|
| 122 |
+
arcface_embeddings = []
|
| 123 |
+
bboxes = []
|
| 124 |
+
for r in res:
|
| 125 |
+
bbox = r["bbox"]
|
| 126 |
+
bboxes.append(bbox)
|
| 127 |
+
moref = extract_moref(image, {"bboxes": [bbox]}, 1)
|
| 128 |
+
ref_imgs.append(moref[0])
|
| 129 |
+
arcface_embeddings.append(r["embedding"])
|
| 130 |
+
|
| 131 |
+
# Convert bboxes to the correct format
|
| 132 |
+
new_img, new_bboxes = face_preserving_resize(image, bboxes, 512)
|
| 133 |
+
return ref_imgs, arcface_embeddings, new_bboxes, new_img
|
| 134 |
+
|
| 135 |
+
|
| 136 |
+
def resize_bbox(bbox, ori_width, ori_height, new_width, new_height):
|
| 137 |
+
"""Resize bounding box coordinates while preserving aspect ratio"""
|
| 138 |
+
x1, y1, x2, y2 = bbox
|
| 139 |
+
|
| 140 |
+
# Calculate scaling factors
|
| 141 |
+
width_scale = new_width / ori_width
|
| 142 |
+
height_scale = new_height / ori_height
|
| 143 |
+
|
| 144 |
+
# Use minimum scaling factor to preserve aspect ratio
|
| 145 |
+
min_scale = min(width_scale, height_scale)
|
| 146 |
+
|
| 147 |
+
# Calculate offsets for centering the scaled box
|
| 148 |
+
width_offset = (new_width - ori_width * min_scale) / 2
|
| 149 |
+
height_offset = (new_height - ori_height * min_scale) / 2
|
| 150 |
+
|
| 151 |
+
# Scale and adjust coordinates
|
| 152 |
+
new_x1 = int(x1 * min_scale + width_offset)
|
| 153 |
+
new_y1 = int(y1 * min_scale + height_offset)
|
| 154 |
+
new_x2 = int(x2 * min_scale + width_offset)
|
| 155 |
+
new_y2 = int(y2 * min_scale + height_offset)
|
| 156 |
+
|
| 157 |
+
return [new_x1, new_y1, new_x2, new_y2]
|
| 158 |
+
|
| 159 |
+
|
| 160 |
+
def draw_bboxes_on_image(image, bboxes):
|
| 161 |
+
"""Draw bounding boxes on image for visualization"""
|
| 162 |
+
if bboxes is None:
|
| 163 |
+
return image
|
| 164 |
+
|
| 165 |
+
# Create a copy to draw on
|
| 166 |
+
img_draw = image.copy()
|
| 167 |
+
draw = ImageDraw.Draw(img_draw)
|
| 168 |
+
|
| 169 |
+
# Draw each bbox with a different color
|
| 170 |
+
colors = [(255, 0, 0), (0, 255, 0), (0, 0, 255), (255, 255, 0)]
|
| 171 |
+
|
| 172 |
+
for i, bbox in enumerate(bboxes):
|
| 173 |
+
color = colors[i % len(colors)]
|
| 174 |
+
x1, y1, x2, y2 = [int(coord) for coord in bbox]
|
| 175 |
+
# Draw rectangle
|
| 176 |
+
draw.rectangle([x1, y1, x2, y2], outline=color, width=3)
|
| 177 |
+
# Draw label
|
| 178 |
+
draw.text((x1, y1-15), f"Face {i+1}", fill=color)
|
| 179 |
+
|
| 180 |
+
return img_draw
|
| 181 |
+
|
| 182 |
+
|
| 183 |
+
def create_demo(
|
| 184 |
+
model_type: str = "flux-dev",
|
| 185 |
+
ipa_path: str = "./ckpt/ipa.safetensors",
|
| 186 |
+
device: str = "cuda" if torch.cuda.is_available() else "cpu",
|
| 187 |
+
offload: bool = False,
|
| 188 |
+
lora_rank: int = 64,
|
| 189 |
+
additional_lora_ckpt: Optional[str] = None,
|
| 190 |
+
lora_scale: float = 1.0,
|
| 191 |
+
clip_path: str = "openai/clip-vit-large-patch14",
|
| 192 |
+
t5_path: str = "xlabs-ai/xflux_text_encoders",
|
| 193 |
+
flux_path: str = "black-forest-labs/FLUX.1-dev",
|
| 194 |
+
):
|
| 195 |
+
|
| 196 |
+
face_extractor = FaceExtractor()
|
| 197 |
+
# Initialize pipeline and face extractor
|
| 198 |
+
pipeline = WithAnyonePipeline(
|
| 199 |
+
model_type,
|
| 200 |
+
ipa_path,
|
| 201 |
+
device,
|
| 202 |
+
offload,
|
| 203 |
+
only_lora=True,
|
| 204 |
+
no_lora=True,
|
| 205 |
+
lora_rank=lora_rank,
|
| 206 |
+
additional_lora_ckpt=additional_lora_ckpt,
|
| 207 |
+
lora_weight=lora_scale,
|
| 208 |
+
face_extractor=face_extractor,
|
| 209 |
+
clip_path=clip_path,
|
| 210 |
+
t5_path=t5_path,
|
| 211 |
+
flux_path=flux_path,
|
| 212 |
+
)
|
| 213 |
+
|
| 214 |
+
|
| 215 |
+
|
| 216 |
+
# Add project badges
|
| 217 |
+
# badges_text = r"""
|
| 218 |
+
# <div style="text-align: center; display: flex; justify-content: left; gap: 5px;">
|
| 219 |
+
# <a href="https://github.com/bytedance/UNO"><img alt="Build" src="https://img.shields.io/github/stars/bytedance/UNO"></a>
|
| 220 |
+
# <a href="https://bytedance.github.io/UNO/"><img alt="Build" src="https://img.shields.io/badge/Project%20Page-UNO-yellow"></a>
|
| 221 |
+
# <a href="https://arxiv.org/abs/2504.02160"><img alt="Build" src="https://img.shields.io/badge/arXiv%20paper-UNO-b31b1b.svg"></a>
|
| 222 |
+
# </div>
|
| 223 |
+
# """.strip()
|
| 224 |
+
|
| 225 |
+
def parse_bboxes(bbox_text):
|
| 226 |
+
"""Parse bounding box text input"""
|
| 227 |
+
if not bbox_text or bbox_text.strip() == "":
|
| 228 |
+
return None
|
| 229 |
+
|
| 230 |
+
try:
|
| 231 |
+
bboxes = []
|
| 232 |
+
lines = bbox_text.strip().split("\n")
|
| 233 |
+
for line in lines:
|
| 234 |
+
if not line.strip():
|
| 235 |
+
continue
|
| 236 |
+
coords = [float(x) for x in line.strip().split(",")]
|
| 237 |
+
if len(coords) != 4:
|
| 238 |
+
raise ValueError(f"Each bbox must have 4 coordinates (x1,y1,x2,y2), got: {line}")
|
| 239 |
+
bboxes.append(coords)
|
| 240 |
+
# print(f"\nParsed bboxes: {bboxes}\n")
|
| 241 |
+
return bboxes
|
| 242 |
+
except Exception as e:
|
| 243 |
+
raise gr.Error(f"Invalid bbox format: {e}")
|
| 244 |
+
|
| 245 |
+
def extract_from_multi_person(multi_person_image):
|
| 246 |
+
"""Extract references and bboxes from a multi-person image"""
|
| 247 |
+
if multi_person_image is None:
|
| 248 |
+
return None, None, None, None
|
| 249 |
+
|
| 250 |
+
# Convert from numpy to PIL if needed
|
| 251 |
+
if isinstance(multi_person_image, np.ndarray):
|
| 252 |
+
multi_person_image = Image.fromarray(multi_person_image)
|
| 253 |
+
|
| 254 |
+
ref_imgs, arcface_embeddings, bboxes, new_img = face_extractor.extract_refs(multi_person_image)
|
| 255 |
+
|
| 256 |
+
if ref_imgs is None or len(ref_imgs) == 0:
|
| 257 |
+
raise gr.Error("No faces detected in the multi-person image")
|
| 258 |
+
|
| 259 |
+
# Limit to max 4 faces
|
| 260 |
+
ref_imgs = ref_imgs[:4]
|
| 261 |
+
arcface_embeddings = arcface_embeddings[:4]
|
| 262 |
+
bboxes = bboxes[:4]
|
| 263 |
+
|
| 264 |
+
# Create visualization with bboxes
|
| 265 |
+
viz_image = draw_bboxes_on_image(new_img, bboxes)
|
| 266 |
+
|
| 267 |
+
# Format bboxes as string for display
|
| 268 |
+
bbox_text = "\n".join([f"{bbox[0]:.1f},{bbox[1]:.1f},{bbox[2]:.1f},{bbox[3]:.1f}" for bbox in bboxes])
|
| 269 |
+
|
| 270 |
+
return ref_imgs, arcface_embeddings, bboxes, viz_image
|
| 271 |
+
|
| 272 |
+
def process_and_generate(
|
| 273 |
+
prompt,
|
| 274 |
+
width, height,
|
| 275 |
+
guidance, num_steps, seed,
|
| 276 |
+
ref_img1, ref_img2, ref_img3, ref_img4,
|
| 277 |
+
manual_bboxes_text,
|
| 278 |
+
multi_person_image,
|
| 279 |
+
# use_text_prompt,
|
| 280 |
+
# id_weight,
|
| 281 |
+
siglip_weight
|
| 282 |
+
):
|
| 283 |
+
# Collect and validate reference images
|
| 284 |
+
ref_images = [img for img in [ref_img1, ref_img2, ref_img3, ref_img4] if img is not None]
|
| 285 |
+
|
| 286 |
+
if not ref_images:
|
| 287 |
+
raise gr.Error("At least one reference image is required")
|
| 288 |
+
|
| 289 |
+
# Process reference images to extract face and embeddings
|
| 290 |
+
ref_imgs = []
|
| 291 |
+
arcface_embeddings = []
|
| 292 |
+
|
| 293 |
+
# Modified bbox handling logic
|
| 294 |
+
if multi_person_image is not None:
|
| 295 |
+
# Extract from multi-person image mode
|
| 296 |
+
extracted_refs, extracted_embeddings, bboxes_, _ = extract_from_multi_person(multi_person_image)
|
| 297 |
+
if extracted_refs is None:
|
| 298 |
+
raise gr.Error("Failed to extract faces from the multi-person image")
|
| 299 |
+
|
| 300 |
+
print("bboxes from multi-person image:", bboxes_)
|
| 301 |
+
# need to resize bboxes from 512 512 to width height
|
| 302 |
+
bboxes_ = [resize_bbox(bbox, 512, 512, width, height) for bbox in bboxes_]
|
| 303 |
+
|
| 304 |
+
else:
|
| 305 |
+
# Parse manual bboxes
|
| 306 |
+
bboxes_ = parse_bboxes(manual_bboxes_text)
|
| 307 |
+
|
| 308 |
+
# If no manual bboxes provided, use automatic captioner
|
| 309 |
+
if bboxes_ is None:
|
| 310 |
+
print("No multi-person image or manual bboxes provided. Using automatic captioner.")
|
| 311 |
+
# Generate automatic bboxes based on image dimensions
|
| 312 |
+
bboxes__ = captioner(prompt, num_person=len(ref_images))
|
| 313 |
+
# resize to width height
|
| 314 |
+
bboxes_ = [resize_bbox(bbox, 512, 512, width, height) for bbox in bboxes__]
|
| 315 |
+
print("Automatically generated bboxes:", bboxes_)
|
| 316 |
+
|
| 317 |
+
bboxes = [bboxes_]  # wrap as a pseudo-batch input
|
| 318 |
+
# else:
|
| 319 |
+
# Manual mode: process each reference image
|
| 320 |
+
for img in ref_images:
|
| 321 |
+
if isinstance(img, np.ndarray):
|
| 322 |
+
img = Image.fromarray(img)
|
| 323 |
+
|
| 324 |
+
ref_img, embedding = face_extractor.extract(img)
|
| 325 |
+
if ref_img is None or embedding is None:
|
| 326 |
+
raise gr.Error("Failed to extract face from one of the reference images")
|
| 327 |
+
|
| 328 |
+
ref_imgs.append(ref_img)
|
| 329 |
+
arcface_embeddings.append(embedding)
|
| 330 |
+
|
| 331 |
+
# pad arcface_embeddings to 4 if less than 4
|
| 332 |
+
# while len(arcface_embeddings) < 4:
|
| 333 |
+
# arcface_embeddings.append(np.zeros_like(arcface_embeddings[0]))
|
| 334 |
+
|
| 335 |
+
|
| 336 |
+
if bboxes is None:
|
| 337 |
+
raise gr.Error("Either provide manual bboxes or a multi-person image for bbox extraction")
|
| 338 |
+
|
| 339 |
+
if len(bboxes[0]) != len(ref_imgs):
|
| 340 |
+
raise gr.Error(f"Number of bboxes ({len(bboxes[0])}) must match number of reference images ({len(ref_imgs)})")
|
| 341 |
+
|
| 342 |
+
# Convert arcface embeddings to tensor
|
| 343 |
+
arcface_embeddings = [torch.tensor(embedding) for embedding in arcface_embeddings]
|
| 344 |
+
arcface_embeddings = torch.stack(arcface_embeddings).to(device)
|
| 345 |
+
|
| 346 |
+
# Generate image
|
| 347 |
+
final_prompt = prompt
|
| 348 |
+
|
| 349 |
+
print(f"Generating image of size {width}x{height} with bboxes: {bboxes} ")
|
| 350 |
+
|
| 351 |
+
if seed < 0:
|
| 352 |
+
seed = np.random.randint(0, 1000000)
|
| 353 |
+
|
| 354 |
+
image_gen = pipeline(
|
| 355 |
+
prompt=final_prompt,
|
| 356 |
+
width=width,
|
| 357 |
+
height=height,
|
| 358 |
+
guidance=guidance,
|
| 359 |
+
num_steps=num_steps,
|
| 360 |
+
seed=seed if seed > 0 else None,
|
| 361 |
+
ref_imgs=ref_imgs,
|
| 362 |
+
arcface_embeddings=arcface_embeddings,
|
| 363 |
+
bboxes=bboxes,
|
| 364 |
+
id_weight = 1 - siglip_weight,
|
| 365 |
+
siglip_weight=siglip_weight,
|
| 366 |
+
)
|
| 367 |
+
|
| 368 |
+
# Save temp file for download
|
| 369 |
+
temp_path = "temp_generated.png"
|
| 370 |
+
image_gen.save(temp_path)
|
| 371 |
+
|
| 372 |
+
# draw bboxes on the generated image for debug
|
| 373 |
+
debug_face = draw_bboxes_on_image(image_gen, bboxes[0])
|
| 374 |
+
|
| 375 |
+
return image_gen, debug_face, temp_path
|
| 376 |
+
|
| 377 |
+
def update_bbox_display(multi_person_image):
|
| 378 |
+
if multi_person_image is None:
|
| 379 |
+
return None, gr.update(visible=True), gr.update(visible=False)
|
| 380 |
+
|
| 381 |
+
try:
|
| 382 |
+
_, _, _, viz_image = extract_from_multi_person(multi_person_image)
|
| 383 |
+
return viz_image, gr.update(visible=False), gr.update(visible=True)
|
| 384 |
+
except Exception as e:
|
| 385 |
+
return None, gr.update(visible=True), gr.update(visible=False)
|
| 386 |
+
|
| 387 |
+
# Create Gradio interface
|
| 388 |
+
with gr.Blocks() as demo:
|
| 389 |
+
gr.Markdown("# WithAnyone Demo")
|
| 390 |
+
# gr.Markdown(badges_text)
|
| 391 |
+
|
| 392 |
+
with gr.Row():
|
| 393 |
+
|
| 394 |
+
with gr.Column():
|
| 395 |
+
# Input controls
|
| 396 |
+
generate_btn = gr.Button("Generate", variant="primary")
|
| 397 |
+
with gr.Row():
|
| 398 |
+
with gr.Column():
|
| 399 |
+
siglip_weight = gr.Slider(0.0, 1.0, 1.0, step=0.05, label="Spiritual Resemblance <--> Formal Resemblance")
|
| 400 |
+
with gr.Row():
|
| 401 |
+
prompt = gr.Textbox(label="Prompt", value="a person in a beautiful garden. High resolution, extremely detailed")
|
| 402 |
+
# use_text_prompt = gr.Checkbox(label="Use text prompt", value=True)
|
| 403 |
+
|
| 404 |
+
|
| 405 |
+
with gr.Row():
|
| 406 |
+
# Image generation settings
|
| 407 |
+
with gr.Column():
|
| 408 |
+
width = gr.Slider(512, 1024, 768, step=64, label="Generation Width")
|
| 409 |
+
height = gr.Slider(512, 1024, 768, step=64, label="Generation Height")
|
| 410 |
+
|
| 411 |
+
with gr.Accordion("Advanced Options", open=False):
|
| 412 |
+
with gr.Row():
|
| 413 |
+
num_steps = gr.Slider(1, 50, 25, step=1, label="Number of steps")
|
| 414 |
+
guidance = gr.Slider(1.0, 10.0, 4.0, step=0.1, label="Guidance")
|
| 415 |
+
seed = gr.Number(-1, label="Seed (-1 for random)")
|
| 416 |
+
|
| 417 |
+
# start_at = gr.Slider(0, 50, 0, step=1, label="Start Identity at Step")
|
| 418 |
+
# end_at = gr.Number(-1, label="End Identity at Step (-1 for last)")
|
| 419 |
+
|
| 420 |
+
# with gr.Row():
|
| 421 |
+
# # skip_every = gr.Number(-1, label="Skip Identity Every N Steps (-1 for no skip)")
|
| 422 |
+
|
| 423 |
+
# siglip_weight = gr.Slider(0.0, 1.0, 1.0, step=0.05, label="Siglip Weight")
|
| 424 |
+
|
| 425 |
+
|
| 426 |
+
with gr.Row():
|
| 427 |
+
with gr.Column():
|
| 428 |
+
# Reference image inputs
|
| 429 |
+
gr.Markdown("### Face References (1-4 required)")
|
| 430 |
+
ref_img1 = gr.Image(label="Reference 1", type="pil")
|
| 431 |
+
ref_img2 = gr.Image(label="Reference 2", type="pil", visible=True)
|
| 432 |
+
ref_img3 = gr.Image(label="Reference 3", type="pil", visible=True)
|
| 433 |
+
ref_img4 = gr.Image(label="Reference 4", type="pil", visible=True)
|
| 434 |
+
|
| 435 |
+
with gr.Column():
|
| 436 |
+
# Bounding box inputs
|
| 437 |
+
gr.Markdown("### Mask Configuration (Option 1: Automatic)")
|
| 438 |
+
multi_person_image = gr.Image(label="Multi-person image (for automatic bbox extraction)", type="pil")
|
| 439 |
+
bbox_preview = gr.Image(label="Detected Faces", type="pil")
|
| 440 |
+
|
| 441 |
+
|
| 442 |
+
gr.Markdown("### Mask Configuration (Option 2: Manual)")
|
| 443 |
+
manual_bbox_input = gr.Textbox(
|
| 444 |
+
label="Manual Bounding Boxes (one per line, format: x1,y1,x2,y2)",
|
| 445 |
+
lines=4,
|
| 446 |
+
placeholder="100,100,200,200\n300,100,400,200"
|
| 447 |
+
)
|
| 448 |
+
|
| 449 |
+
|
| 450 |
+
|
| 451 |
+
|
| 452 |
+
|
| 453 |
+
# generate_btn = gr.Button("Generate", variant="primary")
|
| 454 |
+
|
| 455 |
+
with gr.Column():
|
| 456 |
+
# Output display
|
| 457 |
+
output_image = gr.Image(label="Generated Image")
|
| 458 |
+
debug_face = gr.Image(label="Debug. Faces are expected to be generated in these boxes")
|
| 459 |
+
download_btn = gr.File(label="Download full-resolution", type="filepath", interactive=False)
|
| 460 |
+
|
| 461 |
+
# Examples section
|
| 462 |
+
with gr.Row():
|
| 463 |
+
|
| 464 |
+
gr.Markdown("""
|
| 465 |
+
# Example Configurations
|
| 466 |
+
|
| 467 |
+
### Tips for Better Results
|
| 468 |
+
Do not be discouraged if the first few runs are not very satisfying.
|
| 469 |
+
|
| 470 |
+
- Provide detailed prompts describing the identity. WithAnyone is "controllable", so it needs more information to be controlled. Here are some attributes that may go wrong if not specified:
|
| 471 |
+
- Skin color (generally the race is fine, but for people of Asian descent, if not specified, it may generate a darker skin tone);
|
| 472 |
+
- Age (e.g., instead of "a man", try "a young man". If not specified, it may generate an older figure);
|
| 473 |
+
- Body build;
|
| 474 |
+
- Hairstyle;
|
| 475 |
+
- Accessories (glasses, hats, earrings, etc.);
|
| 476 |
+
- Makeup
|
| 477 |
+
- Use the slider to balance between "Resemblance in Spirit" and "Resemblance in Form" according to your needs. If you want to preserve more details in the reference image, move the slider to the right; if you want more freedom and creativity, move it to the left.
|
| 478 |
+
- Try it with LoRAs from the community. They are usually fantastic.
|
| 479 |
+
""")
|
| 480 |
+
with gr.Row():
|
| 481 |
+
examples = gr.Examples(
|
| 482 |
+
examples=[
|
| 483 |
+
[
|
| 484 |
+
"a highly detailed portrait of a woman shown in profile. Her long, dark hair flows elegantly, intricately decorated with an abundant array of colorful flowers—ranging from soft light pinks and vibrant light oranges to delicate greyish blues—and lush green leaves, giving a sense of natural beauty and charm. Her bright blue eyes are striking, and her lips are painted a vivid red, adding to her alluring appearance. She is clad in an ornate garment with intricate floral patterns in warm hues like pink and orange, featuring exquisite detailing that speaks of fine craftsmanship. Around her neck, she wears a decorative choker with intricate designs, and dangling from her ears are beautiful blue teardrop earrings that catch the light. The background is filled with a profusion of flowers in various shades, creating a rich, vibrant, and romantic atmosphere that complements the woman's elegant and enchanting look.", # prompt
|
| 485 |
+
1024, 1024, # width, height
|
| 486 |
+
4.0, 25, 42, # guidance, num_steps, seed
|
| 487 |
+
"assets/ref1.jpg", None, None, None, # ref images
|
| 488 |
+
"240,180,540,500", None, # manual_bbox_input, multi_person_image
|
| 489 |
+
# True, # use_text_prompt
|
| 490 |
+
0.0, # siglip_weight
|
| 491 |
+
],
|
| 492 |
+
[
|
| 493 |
+
"High resolution anfd extremely detailed image of two elegant ladies enjoying a serene afternoon in a quaint Parisian café. They both wear fashionable trench coats and stylish berets, exuding an air of sophistication. One lady gently sips on a cappuccino, while her companion reads an intriguing novel with a subtle smile. The café is framed by charming antique furniture and vintage posters adorning the walls. Soft, warm light filters through a window, casting delicate shadows and creating a cozy, inviting atmosphere. Captured from a slightly elevated angle, the composition highlights the warmth of the scene in a gentle watercolor illustrative style. ", # prompt
|
| 494 |
+
1024, 1024, # width, height
|
| 495 |
+
4.0, 25, 42, # guidance, num_steps, seed
|
| 496 |
+
"assets/ref1.jpg", "assets/ref2.jpg", None, None, # ref images
|
| 497 |
+
"248,172,428,498\n554,128,728,464", None, # manual_bbox_input, multi_person_image
|
| 498 |
+
# True, # use_text_prompt
|
| 499 |
+
0.0, # siglip_weight
|
| 500 |
+
]
|
| 501 |
+
],
|
| 502 |
+
inputs=[
|
| 503 |
+
prompt, width, height, guidance, num_steps, seed,
|
| 504 |
+
ref_img1, ref_img2, ref_img3, ref_img4,
|
| 505 |
+
manual_bbox_input, multi_person_image,
|
| 506 |
+
siglip_weight
|
| 507 |
+
],
|
| 508 |
+
label="Click to load example configurations"
|
| 509 |
+
)
|
| 510 |
+
# Set up event handlers
|
| 511 |
+
multi_person_image.change(
|
| 512 |
+
fn=update_bbox_display,
|
| 513 |
+
inputs=[multi_person_image],
|
| 514 |
+
outputs=[bbox_preview, manual_bbox_input, bbox_preview]
|
| 515 |
+
)
|
| 516 |
+
|
| 517 |
+
generate_btn.click(
|
| 518 |
+
fn=process_and_generate,
|
| 519 |
+
inputs=[
|
| 520 |
+
prompt, width, height, guidance, num_steps, seed,
|
| 521 |
+
ref_img1, ref_img2, ref_img3, ref_img4,
|
| 522 |
+
manual_bbox_input, multi_person_image,
|
| 523 |
+
siglip_weight
|
| 524 |
+
],
|
| 525 |
+
outputs=[output_image,debug_face, download_btn]
|
| 526 |
+
)
|
| 527 |
+
|
| 528 |
+
return demo
|
| 529 |
+
|
| 530 |
+
|
| 531 |
+
if __name__ == "__main__":
|
| 532 |
+
from transformers import HfArgumentParser
|
| 533 |
+
|
| 534 |
+
@dataclasses.dataclass
|
| 535 |
+
class AppArgs:
|
| 536 |
+
model_type: Literal["flux-dev", "flux-dev-fp8", "flux-schnell"] = "flux-dev"
|
| 537 |
+
device: Literal["cuda", "mps", "cpu"] = (
|
| 538 |
+
"cuda" if torch.cuda.is_available()
|
| 539 |
+
else "mps" if hasattr(torch.backends, "mps") and torch.backends.mps.is_available()
|
| 540 |
+
else "cpu"
|
| 541 |
+
)
|
| 542 |
+
offload: bool = False
|
| 543 |
+
lora_rank: int = 64
|
| 544 |
+
port: int = 7860
|
| 545 |
+
additional_lora: str = None
|
| 546 |
+
lora_scale: float = 1.0
|
| 547 |
+
ipa_path: str = "./ckpt/ipa.safetensors"
|
| 548 |
+
clip_path: str = "openai/clip-vit-large-patch14"
|
| 549 |
+
t5_path: str = "xlabs-ai/xflux_text_encoders"
|
| 550 |
+
flux_path: str = "black-forest-labs/FLUX.1-dev"
|
| 551 |
+
|
| 552 |
+
parser = HfArgumentParser([AppArgs])
|
| 553 |
+
args = parser.parse_args_into_dataclasses()[0]
|
| 554 |
+
|
| 555 |
+
demo = create_demo(
|
| 556 |
+
args.model_type,
|
| 557 |
+
args.ipa_path,
|
| 558 |
+
args.device,
|
| 559 |
+
args.offload,
|
| 560 |
+
args.lora_rank,
|
| 561 |
+
args.additional_lora,
|
| 562 |
+
args.lora_scale,
|
| 563 |
+
args.clip_path,
|
| 564 |
+
args.t5_path,
|
| 565 |
+
args.flux_path,
|
| 566 |
+
)
|
| 567 |
+
demo.launch(server_port=args.port)
|
| 568 |
+
|
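Usage note (a sketch, not part of the commit): create_demo in gradio_app.py can also be called programmatically instead of going through the AppArgs command line. A minimal sketch, assuming the repository root is on PYTHONPATH and the default checkpoints are available locally:

    import torch
    from gradio_app import create_demo

    # Build the demo with the same defaults the AppArgs dataclass uses.
    demo = create_demo(
        model_type="flux-dev",
        ipa_path="./ckpt/ipa.safetensors",   # default checkpoint path from AppArgs
        device="cuda" if torch.cuda.is_available() else "cpu",
        offload=False,
        lora_rank=64,
    )
    demo.launch(server_port=7860)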
gradio_edit.py
ADDED
|
@@ -0,0 +1,563 @@
|
| 1 |
+
# Copyright (c) 2025 Fudan University. All rights reserved.
|
| 2 |
+
|
| 3 |
+
|
| 4 |
+
|
| 5 |
+
import dataclasses
|
| 6 |
+
import json
|
| 7 |
+
import os
|
| 8 |
+
from pathlib import Path
|
| 9 |
+
from typing import List, Literal, Optional, Tuple, Union
|
| 10 |
+
from io import BytesIO
|
| 11 |
+
|
| 12 |
+
import cv2
|
| 13 |
+
import gradio as gr
|
| 14 |
+
import numpy as np
|
| 15 |
+
import torch
|
| 16 |
+
from PIL import Image, ImageDraw, ImageFilter
|
| 17 |
+
from PIL.JpegImagePlugin import JpegImageFile
|
| 18 |
+
|
| 19 |
+
from withanyone_kontext_s.flux.pipeline import WithAnyonePipeline
|
| 20 |
+
from util import extract_moref, face_preserving_resize
|
| 21 |
+
import insightface
|
| 22 |
+
|
| 23 |
+
|
| 24 |
+
def blur_faces_in_image(img, json_data, face_size_threshold=100, blur_radius=15):
|
| 25 |
+
"""
|
| 26 |
+
Blurs facial areas directly in the original image for privacy protection.
|
| 27 |
+
|
| 28 |
+
Args:
|
| 29 |
+
img: PIL Image or image data
|
| 30 |
+
json_data: JSON object with 'bboxes' and 'crop' information
|
| 31 |
+
face_size_threshold: Minimum size for faces to be considered (default: 100 pixels)
|
| 32 |
+
blur_radius: Strength of the blur effect (higher = more blurred)
|
| 33 |
+
|
| 34 |
+
Returns:
|
| 35 |
+
PIL Image with faces blurred
|
| 36 |
+
"""
|
| 37 |
+
# Ensure img is a PIL Image
|
| 38 |
+
if not isinstance(img, Image.Image) and not isinstance(img, torch.Tensor) and not isinstance(img, JpegImageFile):
|
| 39 |
+
img = Image.open(BytesIO(img))
|
| 40 |
+
|
| 41 |
+
new_bboxes = json_data['bboxes']
|
| 42 |
+
# crop = json_data['crop'] if 'crop' in json_data else [0, 0, img.width, img.height]
|
| 43 |
+
|
| 44 |
+
# # Recalculate bounding boxes based on crop info
|
| 45 |
+
# new_bboxes = [recalculate_bbox(bbox, crop) for bbox in bboxes]
|
| 46 |
+
|
| 47 |
+
# Check face sizes and filter out faces that are too small
|
| 48 |
+
valid_bboxes = []
|
| 49 |
+
for bbox in new_bboxes:
|
| 50 |
+
x1, y1, x2, y2 = bbox
|
| 51 |
+
if x2 - x1 >= face_size_threshold and y2 - y1 >= face_size_threshold:
|
| 52 |
+
valid_bboxes.append(bbox)
|
| 53 |
+
|
| 54 |
+
# If no valid faces found, return original image
|
| 55 |
+
if not valid_bboxes:
|
| 56 |
+
return img
|
| 57 |
+
|
| 58 |
+
# Create a copy of the original image to modify
|
| 59 |
+
blurred_img = img.copy()
|
| 60 |
+
|
| 61 |
+
# Process each face
|
| 62 |
+
for bbox in valid_bboxes:
|
| 63 |
+
# Convert coordinates to integers
|
| 64 |
+
x1, y1, x2, y2 = map(int, bbox)
|
| 65 |
+
|
| 66 |
+
# Ensure coordinates are within image boundaries
|
| 67 |
+
img_width, img_height = img.size
|
| 68 |
+
x1 = max(0, x1)
|
| 69 |
+
y1 = max(0, y1)
|
| 70 |
+
x2 = min(img_width, x2)
|
| 71 |
+
y2 = min(img_height, y2)
|
| 72 |
+
|
| 73 |
+
# Extract the face region
|
| 74 |
+
face_region = img.crop((x1, y1, x2, y2))
|
| 75 |
+
|
| 76 |
+
# Apply blur to the face region
|
| 77 |
+
blurred_face = face_region.filter(ImageFilter.GaussianBlur(radius=blur_radius))
|
| 78 |
+
|
| 79 |
+
# Paste the blurred face back into the image
|
| 80 |
+
blurred_img.paste(blurred_face, (x1, y1))
|
| 81 |
+
|
| 82 |
+
return blurred_img
|
| 83 |
+
|
| 84 |
+
|
| 85 |
+
def captioner(prompt: str, num_person = 1) -> List[List[float]]:
|
| 86 |
+
# use random choose for testing
|
| 87 |
+
# within 512
|
| 88 |
+
if num_person == 1:
|
| 89 |
+
bbox_choices = [
|
| 90 |
+
# expanded, centered and quadrant placements
|
| 91 |
+
[96, 96, 288, 288],
|
| 92 |
+
[128, 128, 320, 320],
|
| 93 |
+
[160, 96, 352, 288],
|
| 94 |
+
[96, 160, 288, 352],
|
| 95 |
+
[208, 96, 400, 288],
|
| 96 |
+
[96, 208, 288, 400],
|
| 97 |
+
[192, 160, 368, 336],
|
| 98 |
+
[64, 128, 224, 320],
|
| 99 |
+
[288, 128, 448, 320],
|
| 100 |
+
[128, 256, 320, 448],
|
| 101 |
+
[80, 80, 240, 272],
|
| 102 |
+
[196, 196, 380, 380],
|
| 103 |
+
# originals
|
| 104 |
+
[100, 100, 300, 300],
|
| 105 |
+
[150, 50, 450, 350],
|
| 106 |
+
[200, 100, 500, 400],
|
| 107 |
+
[250, 150, 512, 450],
|
| 108 |
+
]
|
| 109 |
+
return [bbox_choices[np.random.randint(0, len(bbox_choices))]]
|
| 110 |
+
elif num_person == 2:
|
| 111 |
+
# realistic side-by-side rows (no vertical stacks or diagonals)
|
| 112 |
+
bbox_choices = [
|
| 113 |
+
[[64, 112, 224, 304], [288, 112, 448, 304]],
|
| 114 |
+
[[48, 128, 208, 320], [304, 128, 464, 320]],
|
| 115 |
+
[[32, 144, 192, 336], [320, 144, 480, 336]],
|
| 116 |
+
[[80, 96, 240, 288], [272, 96, 432, 288]],
|
| 117 |
+
[[80, 160, 240, 352], [272, 160, 432, 352]],
|
| 118 |
+
[[64, 128, 240, 336], [272, 144, 432, 320]], # slight stagger, same row
|
| 119 |
+
[[96, 160, 256, 352], [288, 160, 448, 352]],
|
| 120 |
+
[[64, 192, 224, 384], [288, 192, 448, 384]], # lower row
|
| 121 |
+
[[16, 128, 176, 320], [336, 128, 496, 320]], # near edges
|
| 122 |
+
[[48, 120, 232, 328], [280, 120, 464, 328]],
|
| 123 |
+
[[96, 160, 240, 336], [272, 160, 416, 336]], # tighter faces
|
| 124 |
+
[[72, 136, 232, 328], [280, 152, 440, 344]], # small vertical offset
|
| 125 |
+
[[48, 120, 224, 344], [288, 144, 448, 336]], # asymmetric sizes
|
| 126 |
+
[[80, 224, 240, 416], [272, 224, 432, 416]], # bottom row
|
| 127 |
+
[[80, 64, 240, 256], [272, 64, 432, 256]], # top row
|
| 128 |
+
[[96, 176, 256, 368], [288, 176, 448, 368]],
|
| 129 |
+
]
|
| 130 |
+
return bbox_choices[np.random.randint(0, len(bbox_choices))]
|
| 131 |
+
|
| 132 |
+
elif num_person == 3:
|
| 133 |
+
# Non-overlapping 3-person layouts within 512x512
|
| 134 |
+
bbox_choices = [
|
| 135 |
+
[[20, 140, 150, 360], [180, 120, 330, 360], [360, 130, 500, 360]],
|
| 136 |
+
[[30, 100, 160, 300], [190, 90, 320, 290], [350, 110, 480, 310]],
|
| 137 |
+
[[40, 180, 150, 330], [200, 180, 310, 330], [360, 180, 470, 330]],
|
| 138 |
+
[[60, 120, 170, 300], [210, 110, 320, 290], [350, 140, 480, 320]],
|
| 139 |
+
[[50, 80, 170, 250], [200, 130, 320, 300], [350, 80, 480, 250]],
|
| 140 |
+
[[40, 260, 170, 480], [190, 60, 320, 240], [350, 260, 490, 480]],
|
| 141 |
+
[[30, 120, 150, 320], [200, 140, 320, 340], [360, 160, 500, 360]],
|
| 142 |
+
[[80, 140, 200, 300], [220, 80, 350, 260], [370, 160, 500, 320]],
|
| 143 |
+
]
|
| 144 |
+
return bbox_choices[np.random.randint(0, len(bbox_choices))]
|
| 145 |
+
elif num_person == 4:
|
| 146 |
+
# Non-overlapping 4-person layouts within 512x512
|
| 147 |
+
bbox_choices = [
|
| 148 |
+
[[20, 100, 120, 240], [140, 100, 240, 240], [260, 100, 360, 240], [380, 100, 480, 240]],
|
| 149 |
+
[[40, 60, 200, 260], [220, 60, 380, 260], [40, 280, 200, 480], [220, 280, 380, 480]],
|
| 150 |
+
[[180, 30, 330, 170], [30, 220, 150, 380], [200, 220, 320, 380], [360, 220, 490, 380]],
|
| 151 |
+
[[30, 60, 140, 200], [370, 60, 480, 200], [30, 320, 140, 460], [370, 320, 480, 460]],
|
| 152 |
+
[[20, 120, 120, 380], [140, 100, 240, 360], [260, 120, 360, 380], [380, 100, 480, 360]],
|
| 153 |
+
[[30, 80, 150, 240], [180, 120, 300, 280], [330, 80, 450, 240], [200, 300, 320, 460]],
|
| 154 |
+
[[30, 140, 110, 330], [140, 140, 220, 330], [250, 140, 330, 330], [370, 140, 450, 330]],
|
| 155 |
+
[[40, 80, 150, 240], [40, 260, 150, 420], [200, 80, 310, 240], [370, 80, 480, 240]],
|
| 156 |
+
]
|
| 157 |
+
return bbox_choices[np.random.randint(0, len(bbox_choices))]
|
| 158 |
+
|
| 159 |
+
|
| 160 |
+
class FaceExtractor:
|
| 161 |
+
def __init__(self, model_path="./"):
|
| 162 |
+
self.model = insightface.app.FaceAnalysis(name="antelopev2", root="./")
|
| 163 |
+
self.model.prepare(ctx_id=0)
|
| 164 |
+
|
| 165 |
+
def extract(self, image: Image.Image):
|
| 166 |
+
"""Extract single face and embedding from an image"""
|
| 167 |
+
image_np = cv2.cvtColor(np.array(image), cv2.COLOR_RGB2BGR)
|
| 168 |
+
res = self.model.get(image_np)
|
| 169 |
+
if len(res) == 0:
|
| 170 |
+
return None, None
|
| 171 |
+
res = res[0]
|
| 172 |
+
bbox = res["bbox"]
|
| 173 |
+
moref = extract_moref(image, {"bboxes": [bbox]}, 1)
|
| 174 |
+
return moref[0], res["embedding"]
|
| 175 |
+
|
| 176 |
+
def extract_refs(self, image: Image.Image):
|
| 177 |
+
"""Extract multiple faces and embeddings from an image"""
|
| 178 |
+
image_np = cv2.cvtColor(np.array(image), cv2.COLOR_RGB2BGR)
|
| 179 |
+
res = self.model.get(image_np)
|
| 180 |
+
if len(res) == 0:
|
| 181 |
+
return None, None, None, None
|
| 182 |
+
ref_imgs = []
|
| 183 |
+
arcface_embeddings = []
|
| 184 |
+
bboxes = []
|
| 185 |
+
for r in res:
|
| 186 |
+
bbox = r["bbox"]
|
| 187 |
+
bboxes.append(bbox)
|
| 188 |
+
moref = extract_moref(image, {"bboxes": [bbox]}, 1)
|
| 189 |
+
ref_imgs.append(moref[0])
|
| 190 |
+
arcface_embeddings.append(r["embedding"])
|
| 191 |
+
|
| 192 |
+
# Convert bboxes to the correct format
|
| 193 |
+
new_img, new_bboxes = face_preserving_resize(image, bboxes, 512)
|
| 194 |
+
return ref_imgs, arcface_embeddings, new_bboxes, new_img
|
| 195 |
+
|
| 196 |
+
|
| 197 |
+
def resize_bbox(bbox, ori_width, ori_height, new_width, new_height):
|
| 198 |
+
"""Resize bounding box coordinates while preserving aspect ratio"""
|
| 199 |
+
x1, y1, x2, y2 = bbox
|
| 200 |
+
|
| 201 |
+
# Calculate scaling factors
|
| 202 |
+
width_scale = new_width / ori_width
|
| 203 |
+
height_scale = new_height / ori_height
|
| 204 |
+
|
| 205 |
+
# Use minimum scaling factor to preserve aspect ratio
|
| 206 |
+
min_scale = min(width_scale, height_scale)
|
| 207 |
+
|
| 208 |
+
# Calculate offsets for centering the scaled box
|
| 209 |
+
width_offset = (new_width - ori_width * min_scale) / 2
|
| 210 |
+
height_offset = (new_height - ori_height * min_scale) / 2
|
| 211 |
+
|
| 212 |
+
# Scale and adjust coordinates
|
| 213 |
+
new_x1 = int(x1 * min_scale + width_offset)
|
| 214 |
+
new_y1 = int(y1 * min_scale + height_offset)
|
| 215 |
+
new_x2 = int(x2 * min_scale + width_offset)
|
| 216 |
+
new_y2 = int(y2 * min_scale + height_offset)
|
| 217 |
+
|
| 218 |
+
return [new_x1, new_y1, new_x2, new_y2]
|
| 219 |
+
|
| 220 |
+
|
| 221 |
+
def draw_bboxes_on_image(image, bboxes):
|
| 222 |
+
"""Draw bounding boxes on image for visualization"""
|
| 223 |
+
if bboxes is None:
|
| 224 |
+
return image
|
| 225 |
+
|
| 226 |
+
# Create a copy to draw on
|
| 227 |
+
img_draw = image.copy()
|
| 228 |
+
draw = ImageDraw.Draw(img_draw)
|
| 229 |
+
|
| 230 |
+
# Draw each bbox with a different color
|
| 231 |
+
colors = [(255, 0, 0), (0, 255, 0), (0, 0, 255), (255, 255, 0)]
|
| 232 |
+
|
| 233 |
+
for i, bbox in enumerate(bboxes):
|
| 234 |
+
color = colors[i % len(colors)]
|
| 235 |
+
x1, y1, x2, y2 = [int(coord) for coord in bbox]
|
| 236 |
+
# Draw rectangle
|
| 237 |
+
draw.rectangle([x1, y1, x2, y2], outline=color, width=3)
|
| 238 |
+
# Draw label
|
| 239 |
+
draw.text((x1, y1-15), f"Face {i+1}", fill=color)
|
| 240 |
+
|
| 241 |
+
return img_draw
|
| 242 |
+
|
| 243 |
+
|
| 244 |
+
def create_demo(
|
| 245 |
+
model_type: str = "flux-dev",
|
| 246 |
+
ipa_path: str = "./ckpt/ipa.safetensors",
|
| 247 |
+
device: str = "cuda" if torch.cuda.is_available() else "cpu",
|
| 248 |
+
offload: bool = False,
|
| 249 |
+
lora_rank: int = 64,
|
| 250 |
+
additional_lora_ckpt: Optional[str] = None,
|
| 251 |
+
lora_scale: float = 1.0,
|
| 252 |
+
clip_path: str = "openai/clip-vit-large-patch14",
|
| 253 |
+
t5_path: str = "xlabs-ai/xflux_text_encoders",
|
| 254 |
+
flux_path: str = "black-forest-labs/FLUX.1-dev",
|
| 255 |
+
):
|
| 256 |
+
|
| 257 |
+
face_extractor = FaceExtractor()
|
| 258 |
+
# Initialize pipeline and face extractor
|
| 259 |
+
pipeline = WithAnyonePipeline(
|
| 260 |
+
model_type,
|
| 261 |
+
ipa_path,
|
| 262 |
+
device,
|
| 263 |
+
offload,
|
| 264 |
+
only_lora=True,
|
| 265 |
+
no_lora=True,
|
| 266 |
+
lora_rank=lora_rank,
|
| 267 |
+
additional_lora_ckpt=additional_lora_ckpt,
|
| 268 |
+
lora_weight=lora_scale,
|
| 269 |
+
face_extractor=face_extractor,
|
| 270 |
+
clip_path=clip_path,
|
| 271 |
+
t5_path=t5_path,
|
| 272 |
+
flux_path=flux_path,
|
| 273 |
+
)
|
| 274 |
+
|
| 275 |
+
|
| 276 |
+
def parse_bboxes(bbox_text):
|
| 277 |
+
"""Parse bounding box text input"""
|
| 278 |
+
if not bbox_text or bbox_text.strip() == "":
|
| 279 |
+
return None
|
| 280 |
+
|
| 281 |
+
try:
|
| 282 |
+
bboxes = []
|
| 283 |
+
lines = bbox_text.strip().split("\n")
|
| 284 |
+
for line in lines:
|
| 285 |
+
if not line.strip():
|
| 286 |
+
continue
|
| 287 |
+
coords = [float(x) for x in line.strip().split(",")]
|
| 288 |
+
if len(coords) != 4:
|
| 289 |
+
raise ValueError(f"Each bbox must have 4 coordinates (x1,y1,x2,y2), got: {line}")
|
| 290 |
+
bboxes.append(coords)
|
| 291 |
+
return bboxes
|
| 292 |
+
except Exception as e:
|
| 293 |
+
raise gr.Error(f"Invalid bbox format: {e}")
|
| 294 |
+
|
| 295 |
+
def extract_from_base_image(base_img):
|
| 296 |
+
"""Extract references and bboxes from the base image"""
|
| 297 |
+
if base_img is None:
|
| 298 |
+
return None, None, None, None
|
| 299 |
+
|
| 300 |
+
# Convert from numpy to PIL if needed
|
| 301 |
+
if isinstance(base_img, np.ndarray):
|
| 302 |
+
base_img = Image.fromarray(base_img)
|
| 303 |
+
|
| 304 |
+
ref_imgs, arcface_embeddings, bboxes, new_img = face_extractor.extract_refs(base_img)
|
| 305 |
+
|
| 306 |
+
if ref_imgs is None or len(ref_imgs) == 0:
|
| 307 |
+
raise gr.Error("No faces detected in the base image")
|
| 308 |
+
|
| 309 |
+
# Limit to max 4 faces
|
| 310 |
+
ref_imgs = ref_imgs[:4]
|
| 311 |
+
arcface_embeddings = arcface_embeddings[:4]
|
| 312 |
+
bboxes = bboxes[:4]
|
| 313 |
+
|
| 314 |
+
# Create visualization with bboxes
|
| 315 |
+
viz_image = draw_bboxes_on_image(new_img, bboxes)
|
| 316 |
+
|
| 317 |
+
# Format bboxes as string for display
|
| 318 |
+
bbox_text = "\n".join([f"{bbox[0]:.1f},{bbox[1]:.1f},{bbox[2]:.1f},{bbox[3]:.1f}" for bbox in bboxes])
|
| 319 |
+
|
| 320 |
+
return ref_imgs, arcface_embeddings, bboxes, viz_image, bbox_text
|
| 321 |
+
|
| 322 |
+
def process_and_generate(
|
| 323 |
+
prompt,
|
| 324 |
+
guidance, num_steps, seed,
|
| 325 |
+
ref_img1, ref_img2, ref_img3, ref_img4,
|
| 326 |
+
base_img,
|
| 327 |
+
manual_bboxes_text,
|
| 328 |
+
use_text_prompt,
|
| 329 |
+
siglip_weight
|
| 330 |
+
):
|
| 331 |
+
# Validate base_img is provided
|
| 332 |
+
if base_img is None:
|
| 333 |
+
raise gr.Error("Base image is required")
|
| 334 |
+
|
| 335 |
+
# Convert numpy to PIL if needed
|
| 336 |
+
if isinstance(base_img, np.ndarray):
|
| 337 |
+
base_img = Image.fromarray(base_img)
|
| 338 |
+
|
| 339 |
+
# Get dimensions from base_img
|
| 340 |
+
width, height = base_img.size
|
| 341 |
+
|
| 342 |
+
|
| 343 |
+
# Collect and validate reference images
|
| 344 |
+
ref_images = [img for img in [ref_img1, ref_img2, ref_img3, ref_img4] if img is not None]
|
| 345 |
+
|
| 346 |
+
if not ref_images:
|
| 347 |
+
raise gr.Error("At least one reference image is required")
|
| 348 |
+
|
| 349 |
+
# Process reference images to extract face and embeddings
|
| 350 |
+
ref_imgs = []
|
| 351 |
+
arcface_embeddings = []
|
| 352 |
+
|
| 353 |
+
# Extract bboxes from the base image
|
| 354 |
+
extracted_refs, extracted_embeddings, bboxes_, _, _ = extract_from_base_image(base_img)
|
| 355 |
+
bboxes__ = [resize_bbox(bbox, 512, 512, width, height) for bbox in bboxes_]
|
| 356 |
+
if extracted_refs is None:
|
| 357 |
+
raise gr.Error("No faces detected in the base image. Please provide a different base image with clear faces.")
|
| 358 |
+
|
| 359 |
+
# Create blurred canvas by blurring faces in the base image
|
| 360 |
+
blurred_canvas = blur_faces_in_image(base_img, {'bboxes': bboxes__})
|
| 361 |
+
|
| 362 |
+
|
| 363 |
+
bboxes = [bboxes__] # Wrap in list for batch input format
|
| 364 |
+
|
| 365 |
+
# Process each reference image
|
| 366 |
+
for img in ref_images:
|
| 367 |
+
if isinstance(img, np.ndarray):
|
| 368 |
+
img = Image.fromarray(img)
|
| 369 |
+
|
| 370 |
+
ref_img, embedding = face_extractor.extract(img)
|
| 371 |
+
if ref_img is None or embedding is None:
|
| 372 |
+
raise gr.Error("Failed to extract face from one of the reference images")
|
| 373 |
+
|
| 374 |
+
ref_imgs.append(ref_img)
|
| 375 |
+
arcface_embeddings.append(embedding)
|
| 376 |
+
|
| 377 |
+
if len(bboxes[0]) != len(ref_imgs):
|
| 378 |
+
raise gr.Error(f"Number of bboxes ({len(bboxes[0])}) must match number of reference images ({len(ref_imgs)})")
|
| 379 |
+
|
| 380 |
+
# Convert arcface embeddings to tensor
|
| 381 |
+
arcface_embeddings = [torch.tensor(embedding) for embedding in arcface_embeddings]
|
| 382 |
+
arcface_embeddings = torch.stack(arcface_embeddings).to(device)
|
| 383 |
+
|
| 384 |
+
# Generate image
|
| 385 |
+
final_prompt = prompt if use_text_prompt else ""
|
| 386 |
+
|
| 387 |
+
|
| 388 |
+
if seed < 0:
|
| 389 |
+
seed = np.random.randint(0, 1000000)
|
| 390 |
+
|
| 391 |
+
image_gen = pipeline(
|
| 392 |
+
prompt=final_prompt,
|
| 393 |
+
width=width,
|
| 394 |
+
height=height,
|
| 395 |
+
guidance=guidance,
|
| 396 |
+
num_steps=num_steps,
|
| 397 |
+
seed=seed if seed > 0 else None,
|
| 398 |
+
ref_imgs=ref_imgs,
|
| 399 |
+
img_cond=blurred_canvas, # Pass the blurred canvas image
|
| 400 |
+
arcface_embeddings=arcface_embeddings,
|
| 401 |
+
bboxes=bboxes,
|
| 402 |
+
max_num_ids=len(ref_imgs),
|
| 403 |
+
siglip_weight=0,
|
| 404 |
+
id_weight=1, # only arcface supported now
|
| 405 |
+
arc_only=True,
|
| 406 |
+
)
|
| 407 |
+
|
| 408 |
+
# Save temp file for download
|
| 409 |
+
temp_path = "temp_generated.png"
|
| 410 |
+
image_gen.save(temp_path)
|
| 411 |
+
|
| 412 |
+
# draw bboxes on the generated image for debug
|
| 413 |
+
debug_face = draw_bboxes_on_image(image_gen, bboxes[0])
|
| 414 |
+
|
| 415 |
+
return image_gen, debug_face, temp_path
|
| 416 |
+
|
| 417 |
+
def update_bbox_display(base_img):
|
| 418 |
+
if base_img is None:
|
| 419 |
+
return None, None
|
| 420 |
+
|
| 421 |
+
try:
|
| 422 |
+
_, _, _, viz_image, bbox_text = extract_from_base_image(base_img)
|
| 423 |
+
return viz_image, bbox_text
    except Exception as e:
        return None, None

    # Create Gradio interface
    with gr.Blocks() as demo:
        gr.Markdown("# WithAnyone Kontext Demo")

        with gr.Row():

            with gr.Column():
                # Input controls
                generate_btn = gr.Button("Generate", variant="primary")
                siglip_weight = 0.0
                with gr.Row():
                    prompt = gr.Textbox(label="Prompt", value="a person in a beautiful garden. High resolution, extremely detailed")
                    use_text_prompt = gr.Checkbox(label="Use text prompt", value=True)

                with gr.Accordion("Advanced Options", open=False):
                    with gr.Row():
                        num_steps = gr.Slider(1, 50, 25, step=1, label="Number of steps")
                        guidance = gr.Slider(1.0, 10.0, 4.0, step=0.1, label="Guidance")
                        seed = gr.Number(-1, label="Seed (-1 for random)")

        with gr.Row():
            with gr.Column():
                # Reference image inputs
                gr.Markdown("### Face References (1-4 required)")
                ref_img1 = gr.Image(label="Reference 1", type="pil")
                ref_img2 = gr.Image(label="Reference 2", type="pil", visible=True)
                ref_img3 = gr.Image(label="Reference 3", type="pil", visible=True)
                ref_img4 = gr.Image(label="Reference 4", type="pil", visible=True)

            with gr.Column():
                # Base image input - combines the previous canvas and multi-person image
                gr.Markdown("### Base Image (Required)")
                base_img = gr.Image(label="Base Image - faces will be detected and replaced", type="pil")

                bbox_preview = gr.Image(label="Detected Faces", type="pil")

                gr.Markdown("### Manual Bounding Box Override (Optional)")
                manual_bbox_input = gr.Textbox(
                    label="Manual Bounding Boxes (one per line, format: x1,y1,x2,y2)",
                    lines=4,
                    placeholder="100,100,200,200\n300,100,400,200"
                )

            with gr.Column():
                # Output display
                output_image = gr.Image(label="Generated Image")
                debug_face = gr.Image(label="Debug: faces are expected to be generated inside these boxes")
                download_btn = gr.File(label="Download full-resolution", type="filepath", interactive=False)

        # Examples section
        with gr.Row():
            gr.Markdown("""
            # Example Configurations

            ### Tips for Better Results
            - The base image is required: faces in it are detected, blurred, and then replaced
            - Provide clear reference images with fully visible faces
            - Use detailed prompts describing the desired output
            - Adjust the resemblance slider to taste: further to the right gives closer facial resemblance
            """)
        with gr.Row():
            examples = gr.Examples(
                examples=[
                    [
                        "",  # prompt
                        4.0, 25, 42,  # guidance, num_steps, seed
                        "assets/ref3.jpg", "assets/ref1.jpg", None, None,  # ref images
                        "assets/canvas.jpg",  # base image
                        False,  # use_text_prompt
                    ]
                ],
                inputs=[
                    prompt, guidance, num_steps, seed,
                    ref_img1, ref_img2, ref_img3, ref_img4,
                    base_img, use_text_prompt
                ],
                label="Click to load example configurations"
            )

        # Set up event handlers
        base_img.change(
            fn=update_bbox_display,
            inputs=[base_img],
            outputs=[bbox_preview, manual_bbox_input]
        )

        generate_btn.click(
            fn=process_and_generate,
            inputs=[
                prompt, guidance, num_steps, seed,
                ref_img1, ref_img2, ref_img3, ref_img4,
                base_img, use_text_prompt,
            ],
            outputs=[output_image, debug_face, download_btn]
        )

    return demo


if __name__ == "__main__":
    from transformers import HfArgumentParser

    @dataclasses.dataclass
    class AppArgs:
        model_type: Literal["flux-dev", "flux-kontext", "flux-schnell"] = "flux-kontext"
        device: Literal["cuda", "cpu"] = (
            "cuda" if torch.cuda.is_available()
            else "mps" if hasattr(torch.backends, "mps") and torch.backends.mps.is_available()
            else "cpu"
        )
        offload: bool = False
        lora_rank: int = 64
        port: int = 7860
        additional_lora: str = None
        lora_scale: float = 1.0
        ipa_path: str = "./ckpt/ipa.safetensors"
        clip_path: str = "openai/clip-vit-large-patch14"
        t5_path: str = "xlabs-ai/xflux_text_encoders"
        flux_path: str = "black-forest-labs/FLUX.1-dev"

    parser = HfArgumentParser([AppArgs])
    args = parser.parse_args_into_dataclasses()[0]

    demo = create_demo(
        args.model_type,
        args.ipa_path,
        args.device,
        args.offload,
        args.lora_rank,
        args.additional_lora,
        args.lora_scale,
        args.clip_path,
        args.t5_path,
        args.flux_path,
    )
    demo.launch(server_port=args.port)
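For reference, the AppArgs defaults above translate directly into a launch call; a minimal sketch, assuming app.py is importable and the default checkpoints are reachable (HfArgumentParser also exposes each field as a CLI flag, e.g. `python app.py --model_type flux-kontext --port 7860`):

```python
import torch
from app import create_demo  # assumes the file above is saved as app.py and importable

demo = create_demo(
    "flux-kontext",                      # model_type
    "./ckpt/ipa.safetensors",            # ipa_path
    "cuda" if torch.cuda.is_available() else "cpu",
    False,                               # offload
    64,                                  # lora_rank
    None,                                # additional_lora
    1.0,                                 # lora_scale
    "openai/clip-vit-large-patch14",     # clip_path
    "xlabs-ai/xflux_text_encoders",      # t5_path
    "black-forest-labs/FLUX.1-dev",      # flux_path
)
demo.launch(server_port=7860)
```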
infer_withanyone.py
ADDED
|
@@ -0,0 +1,309 @@
| 1 |
+
# Copyright (c) 2025 Fudan University. All rights reserved.
|
| 2 |
+
|
| 3 |
+
|
| 4 |
+
|
| 5 |
+
import os
|
| 6 |
+
import dataclasses
|
| 7 |
+
from typing import Literal
|
| 8 |
+
|
| 9 |
+
from accelerate import Accelerator
|
| 10 |
+
from transformers import HfArgumentParser
|
| 11 |
+
from PIL import Image
|
| 12 |
+
import json
|
| 13 |
+
import itertools
|
| 14 |
+
|
| 15 |
+
from withanyone.flux.pipeline import WithAnyonePipeline
|
| 16 |
+
|
| 17 |
+
from util import extract_moref, general_face_preserving_resize, horizontal_concat, extract_object, FaceExtractor
|
| 18 |
+
|
| 19 |
+
import numpy as np
|
| 20 |
+
|
| 21 |
+
import random
|
| 22 |
+
import torch
|
| 23 |
+
|
| 24 |
+
|
| 25 |
+
|
| 26 |
+
from transformers import AutoModelForImageSegmentation
|
| 27 |
+
from torch.cuda.amp import autocast
|
| 28 |
+
|
| 29 |
+
|
| 30 |
+
BACK_UP_BBOXES_DOUBLE = [
|
| 31 |
+
|
| 32 |
+
[[100,100,200,200], [300,100,400,200]], # 2 faces
|
| 33 |
+
[[150,100,250,200], [300,100,400,200]]
|
| 34 |
+
]
|
| 35 |
+
|
| 36 |
+
BACK_UP_BBOXES = [ # for single face
|
| 37 |
+
[[150,100,250,200]],
|
| 38 |
+
[[100,100,200,200]],
|
| 39 |
+
[[200,100,300,200]],
|
| 40 |
+
[[250,100,350,200]],
|
| 41 |
+
[[300,100,400,200]],
|
| 42 |
+
]
|
| 43 |
+
|
| 44 |
+
|
| 45 |
+
|
| 46 |
+
|
| 47 |
+
|
| 48 |
+
|
| 49 |
+
@dataclasses.dataclass
|
| 50 |
+
class InferenceArgs:
|
| 51 |
+
prompt: str | None = None
|
| 52 |
+
image_paths: list[str] | None = None
|
| 53 |
+
eval_json_path: str | None = None
|
| 54 |
+
offload: bool = False
|
| 55 |
+
num_images_per_prompt: int = 1
|
| 56 |
+
model_type: Literal["flux-dev", "flux-dev-fp8", "flux-schnell"] = "flux-dev"
|
| 57 |
+
width: int = 512
|
| 58 |
+
height: int = 512
|
| 59 |
+
ref_size: int = -1
|
| 60 |
+
num_steps: int = 25
|
| 61 |
+
guidance: float = 4
|
| 62 |
+
seed: int = 1234
|
| 63 |
+
save_path: str = "output/inference"
|
| 64 |
+
only_lora: bool = True
|
| 65 |
+
concat_refs: bool = False
|
| 66 |
+
lora_rank: int = 64
|
| 67 |
+
data_resolution: int = 512
|
| 68 |
+
save_iter: str = "500"
|
| 69 |
+
use_rec: bool = False
|
| 70 |
+
drop_text: bool = False
|
| 71 |
+
use_matting: bool = False
|
| 72 |
+
id_weight: float = 1.0
|
| 73 |
+
siglip_weight: float = 1.0
|
| 74 |
+
bbox_from_json: bool = True
|
| 75 |
+
data_root: str = "./"
|
| 76 |
+
# for lora
|
| 77 |
+
additional_lora: str | None = None
|
| 78 |
+
trigger: str = ""
|
| 79 |
+
lora_weight: float = 1.0
|
| 80 |
+
|
| 81 |
+
# path to the ipa model
|
| 82 |
+
ipa_path: str = "./ckpt/ipa.safetensors"
|
| 83 |
+
clip_path: str = "openai/clip-vit-large-patch14"
|
| 84 |
+
t5_path: str = "xlabs-ai/xflux_text_encoders"
|
| 85 |
+
flux_path: str = "black-forest-labs/FLUX.1-dev"
|
| 86 |
+
siglip_path: str = "google/siglip-base-patch16-256-i18n"
|
| 87 |
+
|
| 88 |
+
|
| 89 |
+
|
| 90 |
+
def main(args: InferenceArgs):
|
| 91 |
+
accelerator = Accelerator()
|
| 92 |
+
|
| 93 |
+
face_extractor = FaceExtractor()
|
| 94 |
+
|
| 95 |
+
pipeline = WithAnyonePipeline(
|
| 96 |
+
args.model_type,
|
| 97 |
+
args.ipa_path,
|
| 98 |
+
accelerator.device,
|
| 99 |
+
args.offload,
|
| 100 |
+
only_lora=args.only_lora,
|
| 101 |
+
face_extractor=face_extractor,
|
| 102 |
+
additional_lora_ckpt=args.additional_lora,
|
| 103 |
+
lora_weight=args.lora_weight,
|
| 104 |
+
clip_path=args.clip_path,
|
| 105 |
+
t5_path=args.t5_path,
|
| 106 |
+
flux_path=args.flux_path,
|
| 107 |
+
siglip_path=args.siglip_path,
|
| 108 |
+
)
|
| 109 |
+
|
| 110 |
+
|
| 111 |
+
|
| 112 |
+
if args.use_matting:
|
| 113 |
+
birefnet = AutoModelForImageSegmentation.from_pretrained('briaai/RMBG-2.0', trust_remote_code=True).to('cuda', dtype=torch.bfloat16)
|
| 114 |
+
|
| 115 |
+
|
| 116 |
+
assert args.prompt is not None or args.eval_json_path is not None, \
|
| 117 |
+
"Please provide either prompt or eval_json_path"
|
| 118 |
+
|
| 119 |
+
# if args.eval_json_path is not None:
|
| 120 |
+
assert args.eval_json_path is not None, "Please provide eval_json_path. This script only supports batch inference."
|
| 121 |
+
with open(args.eval_json_path, "rt") as f:
|
| 122 |
+
data_dicts = json.load(f)
|
| 123 |
+
data_root = args.data_root
|
| 124 |
+
|
| 125 |
+
|
| 126 |
+
|
| 127 |
+
metadata = {}
|
| 128 |
+
for (i, data_dict), j in itertools.product(enumerate(data_dicts), range(args.num_images_per_prompt)):
|
| 129 |
+
|
| 130 |
+
|
| 131 |
+
if (i * args.num_images_per_prompt + j) % accelerator.num_processes != accelerator.process_index:
|
| 132 |
+
continue
|
| 133 |
+
# if an output for this sample already exists, it could be skipped here
|
| 134 |
+
|
| 135 |
+
|
| 136 |
+
|
| 137 |
+
# if any of the images are None, skip this image
|
| 138 |
+
if not os.path.exists(os.path.join(data_root, data_dict["image_paths"][0])):
|
| 139 |
+
print(f"Image {i} does not exist, skipping...")
|
| 140 |
+
print("path:", os.path.join(data_root, data_dict["image_paths"][0]))
|
| 141 |
+
continue
|
| 142 |
+
|
| 143 |
+
|
| 144 |
+
# extract bbox
|
| 145 |
+
|
| 146 |
+
ori_img_path = data_dict.get("ori_img_path", None)
|
| 147 |
+
# ori_img = Image.open(os.path.join(data_root, data_dict["ori_img_path"]))
|
| 148 |
+
|
| 149 |
+
# basename = data_dict["ori_img_path"].split(".")[0].replace("/", "_")
|
| 150 |
+
if ori_img_path is None:
|
| 151 |
+
basename = f"{i}_{j}"
|
| 152 |
+
else:
|
| 153 |
+
basename = data_dict["ori_img_path"].split(".")[0].replace("/", "_")
|
| 154 |
+
ori_img = Image.open(os.path.join(data_root, ori_img_path))
|
| 155 |
+
bboxes = None
|
| 156 |
+
print("Processing image", i, basename)
|
| 157 |
+
if not args.bbox_from_json:
|
| 158 |
+
if ori_img_path is None:
|
| 159 |
+
print(f"Image {i} has no ori_img_path, cannot extract bbox, skipping...")
|
| 160 |
+
continue
|
| 161 |
+
ori_img = Image.open(os.path.join(data_root, ori_img_path))
|
| 162 |
+
bboxes = face_extractor.locate_bboxes(ori_img)
|
| 163 |
+
# cut bbox length to num of imgae_paths
|
| 164 |
+
if bboxes is not None and len(bboxes) > len(data_dict["image_paths"]):
|
| 165 |
+
bboxes = bboxes[:len(data_dict["image_paths"])]
|
| 166 |
+
elif bboxes is not None and len(bboxes) < len(data_dict["image_paths"]):
|
| 167 |
+
print(f"Image {i} has less faces than image_paths, continuing...")
|
| 168 |
+
continue
|
| 169 |
+
else:
|
| 170 |
+
json_file_path = os.path.join(data_root, basename + ".json")
|
| 171 |
+
if os.path.exists(json_file_path):
|
| 172 |
+
with open(json_file_path, "r") as f:
|
| 173 |
+
json_data = json.load(f)
|
| 174 |
+
old_bboxes = json_data.get("bboxes", None)
|
| 175 |
+
|
| 176 |
+
if old_bboxes is None:
|
| 177 |
+
print(f"Image {i} has no bboxes in json file, using backup bboxes...")
|
| 178 |
+
# v202 -> 2 faces v200_single -> 1 face
|
| 179 |
+
if "v202" in args.eval_json_path:
|
| 180 |
+
old_bboxes = random.choice(BACK_UP_BBOXES_DOUBLE)
|
| 181 |
+
elif "v200_single" in args.eval_json_path:
|
| 182 |
+
old_bboxes = random.choice(BACK_UP_BBOXES)
|
| 183 |
+
|
| 184 |
+
|
| 185 |
+
def recalculate_bbox( bbox, crop):
|
| 186 |
+
"""
|
| 187 |
+
The image is cropped, so we need to recalculate the bbox.
|
| 188 |
+
bbox: [x1, y1, x2, y2]
|
| 189 |
+
crop: [x1c, y1c, x2c, y2c]
|
| 190 |
+
we just need to minus x1c and y1c from x1, y1,
|
| 191 |
+
"""
|
| 192 |
+
x1, y1, x2, y2 = bbox
|
| 193 |
+
x1c, y1c, x2c, y2c = crop
|
| 194 |
+
return [x1-x1c, y1-y1c, x2-x1c, y2-y1c]
|
| 195 |
+
crop = json_data.get("crop", None)
|
| 196 |
+
rec_bboxes = [
|
| 197 |
+
recalculate_bbox(bbox, crop) if crop is not None else bbox for bbox in old_bboxes]
|
| 198 |
+
# face_preserving_resize(image, bboxes, 512)
|
| 199 |
+
if ori_img_path is not None:
|
| 200 |
+
_, bboxes = general_face_preserving_resize(ori_img, rec_bboxes, 512)
|
| 201 |
+
# otherwise, assume the provided bboxes are already at the target size
|
| 202 |
+
else:
|
| 203 |
+
bboxes = rec_bboxes
|
| 204 |
+
|
| 205 |
+
if bboxes is None:
|
| 206 |
+
|
| 207 |
+
print(f"Image {i} has no face, bboxes are None, using backup bboxes..., basename: {basename}")
|
| 208 |
+
|
| 209 |
+
bboxes = random.choice(BACK_UP_BBOXES_DOUBLE)
|
| 210 |
+
print(f"Use backup bboxes: {bboxes}")
|
| 211 |
+
|
| 212 |
+
|
| 213 |
+
ref_imgs = []
|
| 214 |
+
arcface_embeddings = []
|
| 215 |
+
if not args.use_rec:
|
| 216 |
+
break_flag = False
|
| 217 |
+
for img_path in data_dict["image_paths"]:
|
| 218 |
+
img = Image.open(os.path.join(data_root, img_path))
|
| 219 |
+
|
| 220 |
+
|
| 221 |
+
ref_img, arcface_embedding = face_extractor.extract(img)
|
| 222 |
+
|
| 223 |
+
if ref_img is not None and arcface_embedding is not None:
|
| 224 |
+
if args.use_matting:
|
| 225 |
+
ref_img, _ = extract_object(birefnet, ref_img)
|
| 226 |
+
ref_imgs.append(ref_img)
|
| 227 |
+
arcface_embeddings.append(arcface_embedding)
|
| 228 |
+
else:
|
| 229 |
+
print(f"Image {i} has no face, skipping...")
|
| 230 |
+
break_flag = True
|
| 231 |
+
break
|
| 232 |
+
if break_flag:
|
| 233 |
+
continue
|
| 234 |
+
else:
|
| 235 |
+
ref_imgs, arcface_embeddings = face_extractor.extract_refs(ori_img)
|
| 236 |
+
|
| 237 |
+
if ref_imgs is None or arcface_embeddings is None:
|
| 238 |
+
print(f"Image {i} has no face, skipping...")
|
| 239 |
+
continue
|
| 240 |
+
|
| 241 |
+
if args.use_matting:
|
| 242 |
+
ref_imgs = [extract_object(birefnet, ref_img)[0] for ref_img in ref_imgs]
|
| 243 |
+
|
| 244 |
+
|
| 245 |
+
# arcface to tensor
|
| 246 |
+
arcface_embeddings = [torch.tensor(arcface_embedding) for arcface_embedding in arcface_embeddings]
|
| 247 |
+
arcface_embeddings = torch.stack(arcface_embeddings).to(accelerator.device)
|
| 248 |
+
|
| 249 |
+
|
| 250 |
+
# check, if any of the images are None, if so, skip this image
|
| 251 |
+
if any(ref_img is None for ref_img in ref_imgs):
|
| 252 |
+
print(f"Image {i}: failed to extract face, skipping...")
|
| 253 |
+
continue
|
| 254 |
+
|
| 255 |
+
|
| 256 |
+
if args.ref_size==-1:
|
| 257 |
+
args.ref_size = 512 if len(ref_imgs)==1 else 320
|
| 258 |
+
|
| 259 |
+
|
| 260 |
+
if args.trigger != "" and args.trigger is not None:
|
| 261 |
+
data_dict["prompt"] = args.trigger + " " + data_dict["prompt"]
|
| 262 |
+
|
| 263 |
+
|
| 264 |
+
image_gen = pipeline(
|
| 265 |
+
prompt=data_dict["prompt"] if not args.drop_text else "",
|
| 266 |
+
width=args.width,
|
| 267 |
+
height=args.height,
|
| 268 |
+
guidance=args.guidance,
|
| 269 |
+
num_steps=args.num_steps,
|
| 270 |
+
seed=args.seed,
|
| 271 |
+
ref_imgs=ref_imgs,
|
| 272 |
+
arcface_embeddings=arcface_embeddings,
|
| 273 |
+
bboxes=[bboxes],
|
| 274 |
+
id_weight=args.id_weight,
|
| 275 |
+
siglip_weight=args.siglip_weight,
|
| 276 |
+
|
| 277 |
+
)
|
| 278 |
+
|
| 279 |
+
|
| 280 |
+
if args.concat_refs:
|
| 281 |
+
image_gen = horizontal_concat([image_gen, *ref_imgs])
|
| 282 |
+
|
| 283 |
+
os.makedirs(args.save_path, exist_ok=True)
|
| 284 |
+
|
| 285 |
+
|
| 286 |
+
save_path = os.path.join(args.save_path, basename)
|
| 287 |
+
os.makedirs(os.path.join(args.save_path, basename), exist_ok=True)
|
| 288 |
+
|
| 289 |
+
# save refs, image_gen and original image
|
| 290 |
+
for k, ref_img in enumerate(ref_imgs):
|
| 291 |
+
ref_img.save(os.path.join(save_path, f"ref_{k}.jpg"))
|
| 292 |
+
image_gen.save(os.path.join(save_path, f"out.jpg"))
|
| 293 |
+
# original image
|
| 294 |
+
ori_img = Image.open(os.path.join(data_root, data_dict["ori_img_path"])) if "ori_img_path" in data_dict else None
|
| 295 |
+
if ori_img is not None:
|
| 296 |
+
ori_img.save(os.path.join(save_path, f"ori.jpg"))
|
| 297 |
+
# save config
|
| 298 |
+
args_dict = vars(args)
|
| 299 |
+
args_dict['prompt'] = data_dict["prompt"]
|
| 300 |
+
args_dict["name"] = data_dict["name"] if "name" in data_dict else None
|
| 301 |
+
json.dump(args_dict, open(os.path.join(save_path, f"meta.json"), 'w'), indent=4, ensure_ascii=False)
|
| 302 |
+
|
| 303 |
+
|
| 304 |
+
if __name__ == "__main__":
|
| 305 |
+
parser = HfArgumentParser([InferenceArgs])
|
| 306 |
+
args = parser.parse_args_into_dataclasses()[0]
|
| 307 |
+
main(args)
|
| 308 |
+
|
| 309 |
+
|
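For reference, infer_withanyone.py expects --eval_json_path to point at a list of entries; the following is a hypothetical example containing only the keys the script actually reads (prompt, image_paths, optional ori_img_path and name), plus the sidecar bbox file used when bbox_from_json is enabled:

```python
import json

# Hypothetical manifest for --eval_json_path.
entries = [
    {
        "prompt": "two people having coffee in a sunlit cafe",
        "image_paths": ["refs/person_a.jpg", "refs/person_b.jpg"],   # reference face crops
        "ori_img_path": "originals/cafe.jpg",                        # optional source image
        "name": "cafe_scene",                                        # optional, copied into meta.json
    }
]
with open("eval.json", "w") as f:
    json.dump(entries, f, indent=2)

# With bbox_from_json (the default), face boxes come from a sidecar JSON whose name is
# derived from ori_img_path ("originals/cafe.jpg" -> <data_root>/originals_cafe.json):
sidecar = {"bboxes": [[120, 80, 220, 200], [300, 90, 400, 210]], "crop": None}
with open("originals_cafe.json", "w") as f:
    json.dump(sidecar, f, indent=2)
```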
nohup.out
ADDED
|
@@ -0,0 +1,2 @@
Only 2 GPUs available, exiting.
Only 2 GPUs available, exiting.
requirements.txt
ADDED
|
@@ -0,0 +1,24 @@
accelerate==1.6.0
einops
gradio
huggingface_hub
insightface
matplotlib
numpy
opencv-python
opencv-python-headless
optimum
optimum_quanto
Pillow
PyYAML
PyYAML
safetensors
seaborn
scikit-image
torch==2.5.1
torchvision==0.20.1
tqdm
transformers==4.45.2
onnxruntime
onnxruntime-gpu
sentencepiece
util.py
ADDED
|
@@ -0,0 +1,411 @@
| 1 |
+
# Copyright (c) 2025 Fudan University. All rights reserved.
|
| 2 |
+
|
| 3 |
+
|
| 4 |
+
from io import BytesIO
|
| 5 |
+
import random
|
| 6 |
+
from PIL import Image
|
| 7 |
+
import numpy as np
|
| 8 |
+
import cv2
|
| 9 |
+
import insightface
|
| 10 |
+
import torch
|
| 11 |
+
from torchvision import transforms
|
| 12 |
+
from torch.cuda.amp import autocast
|
| 13 |
+
|
| 14 |
+
def face_preserving_resize(img, face_bboxes, target_size=512):
|
| 15 |
+
"""
|
| 16 |
+
Resize image while ensuring all faces are preserved in the output.
|
| 17 |
+
|
| 18 |
+
Args:
|
| 19 |
+
img: PIL Image
|
| 20 |
+
face_bboxes: List of [x1, y1, x2, y2] face coordinates
|
| 21 |
+
target_size: Maximum dimension for resizing
|
| 22 |
+
|
| 23 |
+
Returns:
|
| 24 |
+
Tuple of (resized image, new_bboxes) or (None, None) if faces can't fit
|
| 25 |
+
"""
|
| 26 |
+
|
| 27 |
+
x1_1, y1_1, x2_1, y2_1 = map(int, face_bboxes[0])
|
| 28 |
+
x1_2, y1_2, x2_2, y2_2 = map(int, face_bboxes[1])
|
| 29 |
+
min_x1 = min(x1_1, x1_2)
|
| 30 |
+
min_y1 = min(y1_1, y1_2)
|
| 31 |
+
max_x2 = max(x2_1, x2_2)
|
| 32 |
+
max_y2 = max(y2_1, y2_2)
|
| 33 |
+
# print("min_x1:", min_x1, "min_y1:", min_y1, "max_x2:", max_x2, "max_y2:", max_y2)
|
| 34 |
+
# if any coordinate is negative we cannot resize (malformed bboxes occasionally appear in the source data)
|
| 35 |
+
if min_x1 < 0 or min_y1 < 0 or max_x2 < 0 or max_y2 < 0:
|
| 36 |
+
return None, None
|
| 37 |
+
|
| 38 |
+
# if face width is longer than the image height, or the face height is longer than the image width, we cannot resize
|
| 39 |
+
face_width = max_x2 - min_x1
|
| 40 |
+
face_height = max_y2 - min_y1
|
| 41 |
+
if face_width > img.height or face_height > img.width:
|
| 42 |
+
return None, None
|
| 43 |
+
|
| 44 |
+
# Create a copy of face_bboxes for transformation
|
| 45 |
+
new_bboxes = []
|
| 46 |
+
for bbox in face_bboxes:
|
| 47 |
+
new_bboxes.append(list(map(int, bbox)))
|
| 48 |
+
|
| 49 |
+
# Choose cropping strategy based on image aspect ratio
|
| 50 |
+
if img.width > img.height:
|
| 51 |
+
# We need to crop width to make a square
|
| 52 |
+
square_size = img.height
|
| 53 |
+
|
| 54 |
+
# Calculate valid horizontal crop range that preserves all faces
|
| 55 |
+
left_max = min_x1 # Leftmost position that includes leftmost face
|
| 56 |
+
right_min = max_x2 - square_size # Rightmost position that includes rightmost face
|
| 57 |
+
|
| 58 |
+
if right_min <= left_max:
|
| 59 |
+
# We can find a valid crop window
|
| 60 |
+
start = random.randint(int(right_min), int(left_max)) if right_min < left_max else int(right_min)
|
| 61 |
+
start = max(0, min(start, img.width - square_size)) # Ensure within image bounds
|
| 62 |
+
else:
|
| 63 |
+
# Faces are too far apart for square crop - use center of faces
|
| 64 |
+
face_center = (min_x1 + max_x2) // 2
|
| 65 |
+
start = max(0, min(face_center - (square_size // 2), img.width - square_size))
|
| 66 |
+
|
| 67 |
+
cropped_img = img.crop((start, 0, start + square_size, square_size))
|
| 68 |
+
|
| 69 |
+
# Adjust bounding box coordinates based on crop
|
| 70 |
+
for bbox in new_bboxes:
|
| 71 |
+
bbox[0] -= start # x1 adjustment
|
| 72 |
+
bbox[2] -= start # x2 adjustment
|
| 73 |
+
# y coordinates remain unchanged
|
| 74 |
+
else:
|
| 75 |
+
# We need to crop height to make a square
|
| 76 |
+
square_size = img.width
|
| 77 |
+
|
| 78 |
+
# Calculate valid vertical crop range that preserves all faces
|
| 79 |
+
top_max = min_y1 # Topmost position that includes topmost face
|
| 80 |
+
bottom_min = max_y2 - square_size # Bottommost position that includes bottommost face
|
| 81 |
+
|
| 82 |
+
if bottom_min <= top_max:
|
| 83 |
+
# We can find a valid crop window
|
| 84 |
+
start = random.randint(int(bottom_min), int(top_max)) if bottom_min < top_max else int(bottom_min)
|
| 85 |
+
start = max(0, min(start, img.height - square_size)) # Ensure within image bounds
|
| 86 |
+
else:
|
| 87 |
+
# Faces are too far apart for square crop - use center of faces
|
| 88 |
+
face_center = (min_y1 + max_y2) // 2
|
| 89 |
+
start = max(0, min(face_center - (square_size // 2), img.height - square_size))
|
| 90 |
+
|
| 91 |
+
cropped_img = img.crop((0, start, square_size, start + square_size))
|
| 92 |
+
|
| 93 |
+
# Adjust bounding box coordinates based on crop
|
| 94 |
+
for bbox in new_bboxes:
|
| 95 |
+
bbox[1] -= start # y1 adjustment
|
| 96 |
+
bbox[3] -= start # y2 adjustment
|
| 97 |
+
# x coordinates remain unchanged
|
| 98 |
+
|
| 99 |
+
# Calculate scale factor for resizing from square_size to target_size
|
| 100 |
+
scale_factor = target_size / square_size
|
| 101 |
+
|
| 102 |
+
# Adjust bounding boxes for the resize operation
|
| 103 |
+
for bbox in new_bboxes:
|
| 104 |
+
bbox[0] = int(bbox[0] * scale_factor)
|
| 105 |
+
bbox[1] = int(bbox[1] * scale_factor)
|
| 106 |
+
bbox[2] = int(bbox[2] * scale_factor)
|
| 107 |
+
bbox[3] = int(bbox[3] * scale_factor)
|
| 108 |
+
|
| 109 |
+
# Final resize to target size
|
| 110 |
+
resized_img = cropped_img.resize((target_size, target_size), Image.Resampling.LANCZOS)
|
| 111 |
+
|
| 112 |
+
# Make sure all coordinates are within bounds (0 to target_size)
|
| 113 |
+
# for bbox in new_bboxes:
|
| 114 |
+
# bbox[0] = max(0, min(bbox[0], target_size - 1))
|
| 115 |
+
# bbox[1] = max(0, min(bbox[1], target_size - 1))
|
| 116 |
+
# bbox[2] = max(1, min(bbox[2], target_size))
|
| 117 |
+
# bbox[3] = max(1, min(bbox[3], target_size))
|
| 118 |
+
|
| 119 |
+
return resized_img, new_bboxes
|
| 120 |
+
|
| 121 |
+
def extract_moref(img, json_data, face_size_restriction=100):
|
| 122 |
+
"""
|
| 123 |
+
Extract faces from an image based on bounding boxes in JSON data.
|
| 124 |
+
Makes each face square and resizes to 512x512.
|
| 125 |
+
|
| 126 |
+
Args:
|
| 127 |
+
img: PIL Image or image data
|
| 128 |
+
json_data: JSON object with 'bboxes' and 'crop' information
|
| 129 |
+
|
| 130 |
+
Returns:
|
| 131 |
+
List of PIL Images, each 512x512, containing extracted faces
|
| 132 |
+
"""
|
| 133 |
+
# Ensure img is a PIL Image
|
| 134 |
+
try:
|
| 135 |
+
if not isinstance(img, (Image.Image, torch.Tensor)):  # JPEG images are Image.Image subclasses, so no separate check is needed
|
| 136 |
+
img = Image.open(BytesIO(img))
|
| 137 |
+
|
| 138 |
+
bboxes = json_data['bboxes']
|
| 139 |
+
# crop = json_data['crop']
|
| 140 |
+
# print("len of bboxes:", len(bboxes))
|
| 141 |
+
# Recalculate bounding boxes based on crop info
|
| 142 |
+
# new_bboxes = [recalculate_bbox(bbox, crop) for bbox in bboxes]
|
| 143 |
+
new_bboxes = bboxes
|
| 144 |
+
# if any face is smaller than face_size_restriction pixels on a side, skip this image
|
| 145 |
+
for bbox in new_bboxes:
|
| 146 |
+
x1, y1, x2, y2 = bbox
|
| 147 |
+
if x2 - x1 < face_size_restriction or y2 - y1 < face_size_restriction:
|
| 148 |
+
return []
|
| 149 |
+
# print("len of new_bboxes:", len(new_bboxes))
|
| 150 |
+
faces = []
|
| 151 |
+
for bbox in new_bboxes:
|
| 152 |
+
# print("processing bbox")
|
| 153 |
+
# Convert coordinates to integers
|
| 154 |
+
x1, y1, x2, y2 = map(int, bbox)
|
| 155 |
+
|
| 156 |
+
# Calculate width and height
|
| 157 |
+
width = x2 - x1
|
| 158 |
+
height = y2 - y1
|
| 159 |
+
|
| 160 |
+
# Make the bounding box square by expanding the shorter dimension
|
| 161 |
+
if width > height:
|
| 162 |
+
# Height is shorter, expand it
|
| 163 |
+
diff = width - height
|
| 164 |
+
y1 -= diff // 2
|
| 165 |
+
y2 += diff - (diff // 2) # Handle odd differences
|
| 166 |
+
elif height > width:
|
| 167 |
+
# Width is shorter, expand it
|
| 168 |
+
diff = height - width
|
| 169 |
+
x1 -= diff // 2
|
| 170 |
+
x2 += diff - (diff // 2) # Handle odd differences
|
| 171 |
+
|
| 172 |
+
# Ensure coordinates are within image boundaries
|
| 173 |
+
img_width, img_height = img.size
|
| 174 |
+
x1 = max(0, x1)
|
| 175 |
+
y1 = max(0, y1)
|
| 176 |
+
x2 = min(img_width, x2)
|
| 177 |
+
y2 = min(img_height, y2)
|
| 178 |
+
|
| 179 |
+
# Extract face region
|
| 180 |
+
face_region = img.crop((x1, y1, x2, y2))
|
| 181 |
+
|
| 182 |
+
# Resize to 512x512
|
| 183 |
+
face_region = face_region.resize((512, 512), Image.LANCZOS)
|
| 184 |
+
|
| 185 |
+
faces.append(face_region)
|
| 186 |
+
# print("len of faces:", len(faces))
|
| 187 |
+
return faces
|
| 188 |
+
except Exception as e:
|
| 189 |
+
print(f"Error processing image: {e}")
|
| 190 |
+
return []
|
| 191 |
+
|
| 192 |
+
def general_face_preserving_resize(img, face_bboxes, target_size=512):
|
| 193 |
+
"""
|
| 194 |
+
Resize image while ensuring all faces are preserved in the output.
|
| 195 |
+
Handles any number of faces (1-5).
|
| 196 |
+
|
| 197 |
+
Args:
|
| 198 |
+
img: PIL Image
|
| 199 |
+
face_bboxes: List of [x1, y1, x2, y2] face coordinates
|
| 200 |
+
target_size: Maximum dimension for resizing
|
| 201 |
+
|
| 202 |
+
Returns:
|
| 203 |
+
Tuple of (resized image, new_bboxes) or (None, None) if faces can't fit
|
| 204 |
+
"""
|
| 205 |
+
# Find bounding region containing all faces
|
| 206 |
+
if not face_bboxes:
|
| 207 |
+
print("Warning: No face bounding boxes provided.")
|
| 208 |
+
return None, None
|
| 209 |
+
|
| 210 |
+
min_x1 = min(bbox[0] for bbox in face_bboxes)
|
| 211 |
+
min_y1 = min(bbox[1] for bbox in face_bboxes)
|
| 212 |
+
max_x2 = max(bbox[2] for bbox in face_bboxes)
|
| 213 |
+
max_y2 = max(bbox[3] for bbox in face_bboxes)
|
| 214 |
+
|
| 215 |
+
# Check for negative coordinates
|
| 216 |
+
if min_x1 < 0 or min_y1 < 0 or max_x2 < 0 or max_y2 < 0:
|
| 217 |
+
# print("Warning: Negative coordinates found in face bounding boxes.")
|
| 218 |
+
# return None, None
|
| 219 |
+
min_x1 = max(min_x1, 0)
|
| 220 |
+
min_y1 = max(min_y1, 0)
|
| 221 |
+
|
| 222 |
+
# Check if faces fit within image
|
| 223 |
+
face_width = max_x2 - min_x1
|
| 224 |
+
face_height = max_y2 - min_y1
|
| 225 |
+
if face_width > img.height or face_height > img.width:
|
| 226 |
+
# print("Warning: Faces are too large for the image dimensions.")
|
| 227 |
+
# return None, None
|
| 228 |
+
# Instead of returning None, we will crop the image to fit the faces
|
| 229 |
+
max_x2 = min(max_x2, img.width)
|
| 230 |
+
max_y2 = min(max_y2, img.height)
|
| 231 |
+
min_x1 = max(min_x1, 0)
|
| 232 |
+
min_y1 = max(min_y1, 0)
|
| 233 |
+
# Create a copy of face_bboxes for transformation
|
| 234 |
+
new_bboxes = []
|
| 235 |
+
for bbox in face_bboxes:
|
| 236 |
+
new_bboxes.append(list(map(int, bbox)))
|
| 237 |
+
|
| 238 |
+
# Choose cropping strategy based on image aspect ratio
|
| 239 |
+
if img.width > img.height:
|
| 240 |
+
# Crop width to make a square
|
| 241 |
+
square_size = img.height
|
| 242 |
+
|
| 243 |
+
# Calculate valid horizontal crop range
|
| 244 |
+
left_max = min_x1
|
| 245 |
+
right_min = max_x2 - square_size
|
| 246 |
+
|
| 247 |
+
if right_min <= left_max:
|
| 248 |
+
# We can find a valid crop window
|
| 249 |
+
start = random.randint(int(right_min), int(left_max)) if right_min < left_max else int(right_min)
|
| 250 |
+
start = max(0, min(start, img.width - square_size))
|
| 251 |
+
else:
|
| 252 |
+
# Faces are too far apart - use center of faces
|
| 253 |
+
face_center = (min_x1 + max_x2) // 2
|
| 254 |
+
start = max(0, min(face_center - (square_size // 2), img.width - square_size))
|
| 255 |
+
|
| 256 |
+
cropped_img = img.crop((start, 0, start + square_size, square_size))
|
| 257 |
+
|
| 258 |
+
# Adjust bounding box coordinates
|
| 259 |
+
for bbox in new_bboxes:
|
| 260 |
+
bbox[0] -= start
|
| 261 |
+
bbox[2] -= start
|
| 262 |
+
else:
|
| 263 |
+
# Crop height to make a square
|
| 264 |
+
square_size = img.width
|
| 265 |
+
|
| 266 |
+
# Calculate valid vertical crop range
|
| 267 |
+
top_max = min_y1
|
| 268 |
+
bottom_min = max_y2 - square_size
|
| 269 |
+
|
| 270 |
+
if bottom_min <= top_max:
|
| 271 |
+
start = random.randint(int(bottom_min), int(top_max)) if bottom_min < top_max else int(bottom_min)
|
| 272 |
+
start = max(0, min(start, img.height - square_size))
|
| 273 |
+
else:
|
| 274 |
+
face_center = (min_y1 + max_y2) // 2
|
| 275 |
+
start = max(0, min(face_center - (square_size // 2), img.height - square_size))
|
| 276 |
+
|
| 277 |
+
cropped_img = img.crop((0, start, square_size, start + square_size))
|
| 278 |
+
|
| 279 |
+
# Adjust bounding box coordinates
|
| 280 |
+
for bbox in new_bboxes:
|
| 281 |
+
bbox[1] -= start
|
| 282 |
+
bbox[3] -= start
|
| 283 |
+
|
| 284 |
+
# Calculate scale factor and adjust bounding boxes
|
| 285 |
+
scale_factor = target_size / square_size
|
| 286 |
+
|
| 287 |
+
for bbox in new_bboxes:
|
| 288 |
+
bbox[0] = int(bbox[0] * scale_factor)
|
| 289 |
+
bbox[1] = int(bbox[1] * scale_factor)
|
| 290 |
+
bbox[2] = int(bbox[2] * scale_factor)
|
| 291 |
+
bbox[3] = int(bbox[3] * scale_factor)
|
| 292 |
+
|
| 293 |
+
# Final resize to target size
|
| 294 |
+
resized_img = cropped_img.resize((target_size, target_size), Image.Resampling.LANCZOS)
|
| 295 |
+
|
| 296 |
+
# Make sure all coordinates are within bounds
|
| 297 |
+
for bbox in new_bboxes:
|
| 298 |
+
bbox[0] = max(0, min(bbox[0], target_size - 1))
|
| 299 |
+
bbox[1] = max(0, min(bbox[1], target_size - 1))
|
| 300 |
+
bbox[2] = max(1, min(bbox[2], target_size))
|
| 301 |
+
bbox[3] = max(1, min(bbox[3], target_size))
|
| 302 |
+
|
| 303 |
+
return resized_img, new_bboxes
|
| 304 |
+
|
| 305 |
+
|
| 306 |
+
|
| 307 |
+
def horizontal_concat(images):
|
| 308 |
+
widths, heights = zip(*(img.size for img in images))
|
| 309 |
+
|
| 310 |
+
total_width = sum(widths)
|
| 311 |
+
max_height = max(heights)
|
| 312 |
+
|
| 313 |
+
new_im = Image.new('RGB', (total_width, max_height))
|
| 314 |
+
|
| 315 |
+
x_offset = 0
|
| 316 |
+
for img in images:
|
| 317 |
+
new_im.paste(img, (x_offset, 0))
|
| 318 |
+
x_offset += img.size[0]
|
| 319 |
+
|
| 320 |
+
return new_im
|
| 321 |
+
|
| 322 |
+
def extract_object(birefnet, image):
|
| 323 |
+
|
| 324 |
+
|
| 325 |
+
if image.mode != 'RGB':
|
| 326 |
+
image = image.convert('RGB')
|
| 327 |
+
input_images = transforms.ToTensor()(image).unsqueeze(0).to('cuda', dtype=torch.bfloat16)
|
| 328 |
+
|
| 329 |
+
# Prediction
|
| 330 |
+
with torch.no_grad(), autocast(dtype=torch.bfloat16):
|
| 331 |
+
preds = birefnet(input_images)[-1].sigmoid().cpu()
|
| 332 |
+
pred = preds[0].squeeze().float()
|
| 333 |
+
pred_pil = transforms.ToPILImage()(pred)
|
| 334 |
+
mask = pred_pil.resize(image.size)
|
| 335 |
+
|
| 336 |
+
# Create a binary mask (0 or 255)
|
| 337 |
+
binary_mask = mask.convert("L")
|
| 338 |
+
|
| 339 |
+
# Create a new image with black background
|
| 340 |
+
result = Image.new("RGB", image.size, (0, 0, 0))
|
| 341 |
+
|
| 342 |
+
# Paste the original image onto the black background using the mask
|
| 343 |
+
result.paste(image, (0, 0), binary_mask)
|
| 344 |
+
|
| 345 |
+
return result, mask
|
| 346 |
+
|
| 347 |
+
class FaceExtractor:
|
| 348 |
+
def __init__(self):
|
| 349 |
+
self.model = insightface.app.FaceAnalysis(name = "antelopev2", root="./")
|
| 350 |
+
self.model.prepare(ctx_id=0, det_thresh=0.4)
|
| 351 |
+
|
| 352 |
+
def extract(self, image: Image.Image):
|
| 353 |
+
image_np = cv2.cvtColor(np.array(image), cv2.COLOR_RGB2BGR)
|
| 354 |
+
res = self.model.get(image_np)
|
| 355 |
+
if len(res) == 0:
|
| 356 |
+
return None, None
|
| 357 |
+
res = res[0]
|
| 358 |
+
# print(res.keys())
|
| 359 |
+
bbox = res["bbox"]
|
| 360 |
+
# print("len(bbox)", len(bbox))
|
| 361 |
+
|
| 362 |
+
moref = extract_moref(image, {"bboxes": [bbox]}, 1)
|
| 363 |
+
# print("len(moref)", len(moref))
|
| 364 |
+
return moref[0], res["embedding"]
|
| 365 |
+
|
| 366 |
+
def locate_bboxes(self, image: Image.Image):
|
| 367 |
+
image_np = cv2.cvtColor(np.array(image), cv2.COLOR_RGB2BGR)
|
| 368 |
+
res = self.model.get(image_np)
|
| 369 |
+
if len(res) == 0:
|
| 370 |
+
return None
|
| 371 |
+
bboxes = []
|
| 372 |
+
for r in res:
|
| 373 |
+
bbox = r["bbox"]
|
| 374 |
+
bboxes.append(bbox)
|
| 375 |
+
|
| 376 |
+
_, new_bboxes_ = general_face_preserving_resize(image, bboxes, 512)
|
| 377 |
+
|
| 378 |
+
# ensure the bbox is square
|
| 379 |
+
new_bboxes = []
|
| 380 |
+
for bbox in new_bboxes_:
|
| 381 |
+
x1, y1, x2, y2 = bbox
|
| 382 |
+
w = x2 - x1
|
| 383 |
+
h = y2 - y1
|
| 384 |
+
if w > h:
|
| 385 |
+
diff = w - h
|
| 386 |
+
y1 = max(0, y1 - diff // 2)
|
| 387 |
+
y2 = min(512, y2 + diff // 2 + diff % 2)
|
| 388 |
+
else:
|
| 389 |
+
diff = h - w
|
| 390 |
+
x1 = max(0, x1 - diff // 2)
|
| 391 |
+
x2 = min(512, x2 + diff // 2 + diff % 2)
|
| 392 |
+
new_bboxes.append([x1, y1, x2, y2])
|
| 393 |
+
|
| 394 |
+
return new_bboxes
|
| 395 |
+
def extract_refs(self, image: Image.Image):
|
| 396 |
+
"""
|
| 397 |
+
Extracts reference faces from the image.
|
| 398 |
+
Returns a list of reference images and their arcface embeddings.
|
| 399 |
+
"""
|
| 400 |
+
image_np = cv2.cvtColor(np.array(image), cv2.COLOR_RGB2BGR)
|
| 401 |
+
res = self.model.get(image_np)
|
| 402 |
+
if len(res) == 0:
|
| 403 |
+
return None, None
|
| 404 |
+
ref_imgs = []
|
| 405 |
+
arcface_embeddings = []
|
| 406 |
+
for r in res:
|
| 407 |
+
bbox = r["bbox"]
|
| 408 |
+
moref = extract_moref(image, {"bboxes": [bbox]}, 1)
|
| 409 |
+
ref_imgs.append(moref[0])
|
| 410 |
+
arcface_embeddings.append(r["embedding"])
|
| 411 |
+
return ref_imgs, arcface_embeddings
|
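A quick usage sketch for the FaceExtractor class above, assuming the antelopev2 InsightFace models are available under the local ./models directory and using a hypothetical input image:

```python
from PIL import Image
from util import FaceExtractor  # assumes this file is importable as util.py

extractor = FaceExtractor()      # loads the antelopev2 InsightFace detector + ArcFace model
group = Image.open("group_photo.jpg").convert("RGB")   # hypothetical input image

# Square face boxes mapped into the 512x512 working resolution (None if no face is found)
bboxes = extractor.locate_bboxes(group)

# One 512x512 face crop plus one 512-d ArcFace embedding per detected face
ref_imgs, arcface_embeddings = extractor.extract_refs(group)
if ref_imgs is not None:
    print(len(ref_imgs), arcface_embeddings[0].shape)   # e.g. 2 (512,)
```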
withanyone/flux/__pycache__/math.cpython-310.pyc
ADDED (binary file, 2.03 kB)
withanyone/flux/__pycache__/model.cpython-310.pyc
ADDED (binary file, 14.3 kB)
withanyone/flux/__pycache__/pipeline.cpython-310.pyc
ADDED (binary file, 8.58 kB)
withanyone/flux/__pycache__/sampling.cpython-310.pyc
ADDED (binary file, 4.12 kB)
withanyone/flux/__pycache__/util.cpython-310.pyc
ADDED (binary file, 11 kB)
withanyone/flux/math.py
ADDED
|
@@ -0,0 +1,49 @@
import torch
from einops import rearrange
from torch import Tensor

import torch
import matplotlib.pyplot as plt
import numpy as np
import os
import seaborn as sns
from torch import Tensor
from matplotlib.colors import LinearSegmentedColormap
from dataclasses import dataclass


# a return class
@dataclass
class AttentionReturnQAndMAP:
    result: Tensor
    attention_map: Tensor
    Q: Tensor


def attention(q: Tensor, k: Tensor, v: Tensor, pe: Tensor, mask=None, token_aug_idx=-1, text_length=None, image_length=None, return_map=False) -> Tensor:
    q, k = apply_rope(q, k, pe)
    x = torch.nn.functional.scaled_dot_product_attention(q, k, v, mask)
    x = rearrange(x, "B H L D -> B L (H D)")

    return x


def rope(pos: Tensor, dim: int, theta: int) -> Tensor:
    assert dim % 2 == 0
    scale = torch.arange(0, dim, 2, dtype=torch.float64, device=pos.device) / dim
    omega = 1.0 / (theta**scale)
    out = torch.einsum("...n,d->...nd", pos, omega)
    out = torch.stack([torch.cos(out), -torch.sin(out), torch.sin(out), torch.cos(out)], dim=-1)
    out = rearrange(out, "b n d (i j) -> b n d i j", i=2, j=2)
    return out.float()


def apply_rope(xq: Tensor, xk: Tensor, freqs_cis: Tensor) -> tuple[Tensor, Tensor]:
    xq_ = xq.float().reshape(*xq.shape[:-1], -1, 1, 2)
    xk_ = xk.float().reshape(*xk.shape[:-1], -1, 1, 2)
    xq_out = freqs_cis[..., 0] * xq_[..., 0] + freqs_cis[..., 1] * xq_[..., 1]
    xk_out = freqs_cis[..., 0] * xk_[..., 0] + freqs_cis[..., 1] * xk_[..., 1]
    return xq_out.reshape(*xq.shape).type_as(xq), xk_out.reshape(*xk.shape).type_as(xk)
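A small, self-contained shape check for the helpers above; the dimensions mirror FLUX's 24 attention heads of size 128, but the values are purely illustrative:

```python
import torch

B, H, L, D = 1, 24, 32, 128
q = torch.randn(B, H, L, D)
k = torch.randn(B, H, L, D)
v = torch.randn(B, H, L, D)

pos = torch.arange(L, dtype=torch.float64)[None, :]   # (1, L) token positions
pe = rope(pos, D, theta=10_000)                       # (1, L, D/2, 2, 2) rotary factors
                                                      # (in the model, pe comes from EmbedND)

out = attention(q, k, v, pe)                          # RoPE + SDPA + head merge
print(out.shape)                                      # torch.Size([1, 32, 3072])
```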
withanyone/flux/model.py
ADDED
|
@@ -0,0 +1,610 @@
| 1 |
+
|
| 2 |
+
|
| 3 |
+
from dataclasses import dataclass
|
| 4 |
+
|
| 5 |
+
import torch
|
| 6 |
+
from torch import Tensor, nn
|
| 7 |
+
|
| 8 |
+
from .modules.layers import DoubleStreamBlock, EmbedND, LastLayer, MLPEmbedder, SingleStreamBlock, timestep_embedding, PerceiverAttentionCA
|
| 9 |
+
from transformers import AutoTokenizer, AutoProcessor, SiglipModel
|
| 10 |
+
import math
|
| 11 |
+
from transformers import AutoModelForImageSegmentation
|
| 12 |
+
from einops import rearrange
|
| 13 |
+
|
| 14 |
+
from torchvision import transforms
|
| 15 |
+
from PIL import Image
|
| 16 |
+
from torch.cuda.amp import autocast
|
| 17 |
+
|
| 18 |
+
|
| 19 |
+
|
| 20 |
+
def create_person_cross_attention_mask_varlen(
|
| 21 |
+
batch_size, img_len, id_len,
|
| 22 |
+
bbox_lists, original_width, original_height,
|
| 23 |
+
max_num_ids=2, # Default to support 2 identities
|
| 24 |
+
vae_scale_factor=8, patch_size=2, num_heads = 24
|
| 25 |
+
):
|
| 26 |
+
"""
|
| 27 |
+
Create boolean attention masks limiting image tokens to interact only with corresponding person ID tokens
|
| 28 |
+
|
| 29 |
+
Parameters:
|
| 30 |
+
- batch_size: Number of samples in batch
|
| 31 |
+
- num_heads: Number of attention heads
|
| 32 |
+
- img_len: Length of image token sequence
|
| 33 |
+
- id_len: Length of EACH identity embedding (not total)
|
| 34 |
+
- bbox_lists: List where bbox_lists[i] contains all bboxes for batch i
|
| 35 |
+
Each batch may have a different number of bboxes/identities
|
| 36 |
+
- max_num_ids: Maximum number of identities to support (for padding)
|
| 37 |
+
- original_width/height: Original image dimensions
|
| 38 |
+
- vae_scale_factor: VAE downsampling factor (default 8)
|
| 39 |
+
- patch_size: Patch size for token creation (default 2)
|
| 40 |
+
|
| 41 |
+
Returns:
|
| 42 |
+
- Boolean attention mask of shape [batch_size, num_heads, img_len, total_id_len]
|
| 43 |
+
"""
|
| 44 |
+
# Total length of ID tokens based on maximum number of identities
|
| 45 |
+
total_id_len = max_num_ids * id_len
|
| 46 |
+
|
| 47 |
+
# Initialize mask to block all attention
|
| 48 |
+
mask = torch.zeros((batch_size, num_heads, img_len, total_id_len), dtype=torch.bool)
|
| 49 |
+
|
| 50 |
+
# Calculate VAE dimensions
|
| 51 |
+
latent_width = original_width // vae_scale_factor
|
| 52 |
+
latent_height = original_height // vae_scale_factor
|
| 53 |
+
patches_width = latent_width // patch_size
|
| 54 |
+
patches_height = latent_height // patch_size
|
| 55 |
+
|
| 56 |
+
|
| 57 |
+
|
| 58 |
+
# Convert boundary box to token indices
|
| 59 |
+
def bbox_to_token_indices(bbox):
|
| 60 |
+
x1, y1, x2, y2 = bbox
|
| 61 |
+
|
| 62 |
+
# Convert to patch space coordinates
|
| 63 |
+
if isinstance(x1, torch.Tensor):
|
| 64 |
+
x1_patch = max(0, int(x1.item()) // vae_scale_factor // patch_size)
|
| 65 |
+
y1_patch = max(0, int(y1.item()) // vae_scale_factor // patch_size)
|
| 66 |
+
x2_patch = min(patches_width, math.ceil(int(x2.item()) / vae_scale_factor / patch_size))
|
| 67 |
+
y2_patch = min(patches_height, math.ceil(int(y2.item()) / vae_scale_factor / patch_size))
|
| 68 |
+
elif isinstance(x1, int):
|
| 69 |
+
x1_patch = max(0, x1 // vae_scale_factor // patch_size)
|
| 70 |
+
y1_patch = max(0, y1 // vae_scale_factor // patch_size)
|
| 71 |
+
x2_patch = min(patches_width, math.ceil(x2 / vae_scale_factor / patch_size))
|
| 72 |
+
y2_patch = min(patches_height, math.ceil(y2 / vae_scale_factor / patch_size))
|
| 73 |
+
elif isinstance(x1, float):
|
| 74 |
+
x1_patch = max(0, int(x1) // vae_scale_factor // patch_size)
|
| 75 |
+
y1_patch = max(0, int(y1) // vae_scale_factor // patch_size)
|
| 76 |
+
x2_patch = min(patches_width, math.ceil(x2 / vae_scale_factor / patch_size))
|
| 77 |
+
y2_patch = min(patches_height, math.ceil(y2 / vae_scale_factor / patch_size))
|
| 78 |
+
else:
|
| 79 |
+
raise TypeError(f"Unsupported type: {type(x1)}")
|
| 80 |
+
|
| 81 |
+
# Create list of all token indices in this region
|
| 82 |
+
indices = []
|
| 83 |
+
for y in range(y1_patch, y2_patch):
|
| 84 |
+
for x in range(x1_patch, x2_patch):
|
| 85 |
+
idx = y * patches_width + x
|
| 86 |
+
indices.append(idx)
|
| 87 |
+
|
| 88 |
+
return indices
|
| 89 |
+
|
| 90 |
+
for b in range(batch_size):
|
| 91 |
+
# Get all bboxes for this batch item
|
| 92 |
+
batch_bboxes = bbox_lists[b] if b < len(bbox_lists) else []
|
| 93 |
+
|
| 94 |
+
# Process each bbox in the batch up to max_num_ids
|
| 95 |
+
for identity_idx, bbox in enumerate(batch_bboxes[:max_num_ids]):
|
| 96 |
+
# Get image token indices for this bbox
|
| 97 |
+
image_indices = bbox_to_token_indices(bbox)
|
| 98 |
+
|
| 99 |
+
# Calculate ID token slice for this identity
|
| 100 |
+
id_start = identity_idx * id_len
|
| 101 |
+
id_end = id_start + id_len
|
| 102 |
+
id_slice = slice(id_start, id_end)
|
| 103 |
+
|
| 104 |
+
# Enable attention between this region's image tokens and the identity's tokens
|
| 105 |
+
for h in range(num_heads):
|
| 106 |
+
for idx in image_indices:
|
| 107 |
+
mask[b, h, idx, id_slice] = True
|
| 108 |
+
|
| 109 |
+
return mask
|
| 110 |
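For intuition, a worked example of the mapping above (values assumed): at 512x512 with vae_scale_factor=8 and patch_size=2, each image token covers a 16x16 pixel block, so there are 32x32 = 1024 image tokens, and a face box [100, 100, 200, 200] spans patch rows and columns 6 through 12.

```python
# Illustrative call to the function defined above; id_len=264 assumes 256 SigLIP tokens
# plus 8 ArcFace tokens per identity, matching the ID-embedding layout built further below.
mask = create_person_cross_attention_mask_varlen(
    batch_size=1, img_len=1024, id_len=264,
    bbox_lists=[[[100, 100, 200, 200], [300, 100, 400, 200]]],
    original_width=512, original_height=512,
    max_num_ids=2, num_heads=24,
)
print(mask.shape)                                    # torch.Size([1, 24, 1024, 528])
token = 6 * 32 + 6                                   # a patch inside the first face box
print(mask[0, 0, token, :264].all().item())          # True  (attends to identity 0's tokens)
print(mask[0, 0, token, 264:].any().item())          # False (blocked from identity 1's tokens)
```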
+
|
| 111 |
+
|
| 112 |
+
|
| 113 |
+
|
| 114 |
+
# FFN
|
| 115 |
+
def FeedForward(dim, mult=4):
|
| 116 |
+
inner_dim = int(dim * mult)
|
| 117 |
+
return nn.Sequential(
|
| 118 |
+
nn.LayerNorm(dim),
|
| 119 |
+
nn.Linear(dim, inner_dim, bias=False),
|
| 120 |
+
nn.GELU(),
|
| 121 |
+
nn.Linear(inner_dim, dim, bias=False),
|
| 122 |
+
)
|
| 123 |
+
|
| 124 |
+
|
| 125 |
+
|
| 126 |
+
@dataclass
|
| 127 |
+
class FluxParams:
|
| 128 |
+
in_channels: int
|
| 129 |
+
vec_in_dim: int
|
| 130 |
+
context_in_dim: int
|
| 131 |
+
hidden_size: int
|
| 132 |
+
mlp_ratio: float
|
| 133 |
+
num_heads: int
|
| 134 |
+
depth: int
|
| 135 |
+
depth_single_blocks: int
|
| 136 |
+
axes_dim: list[int]
|
| 137 |
+
theta: int
|
| 138 |
+
qkv_bias: bool
|
| 139 |
+
guidance_embed: bool
|
| 140 |
+
|
| 141 |
+
|
| 142 |
+
class SiglipEmbedding(nn.Module):
|
| 143 |
+
def __init__(self, siglip_path = "google/siglip-base-patch16-256-i18n", use_matting=False):
|
| 144 |
+
super().__init__()
|
| 145 |
+
self.model = SiglipModel.from_pretrained(siglip_path).vision_model.to(torch.bfloat16)
|
| 146 |
+
self.processor = AutoProcessor.from_pretrained(siglip_path)
|
| 147 |
+
self.model.to(torch.cuda.current_device())
|
| 148 |
+
|
| 149 |
+
# BiRefNet matting setup
|
| 150 |
+
self.use_matting = use_matting
|
| 151 |
+
if self.use_matting:
|
| 152 |
+
self.birefnet = AutoModelForImageSegmentation.from_pretrained(
|
| 153 |
+
'briaai/RMBG-2.0', trust_remote_code=True).to(torch.cuda.current_device(), dtype=torch.bfloat16)
|
| 154 |
+
# Apply half precision to the entire model after loading
|
| 155 |
+
self.matting_transform = transforms.Compose([
|
| 156 |
+
# transforms.Resize((512, 512)),
|
| 157 |
+
transforms.ToTensor(),
|
| 158 |
+
transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
|
| 159 |
+
])
|
| 160 |
+
|
| 161 |
+
def apply_matting(self, image):
|
| 162 |
+
"""Apply BiRefNet matting to remove background from image"""
|
| 163 |
+
if not self.use_matting:
|
| 164 |
+
return image
|
| 165 |
+
|
| 166 |
+
# Convert to input format and move to GPU
|
| 167 |
+
input_image = self.matting_transform(image).unsqueeze(0).to(torch.cuda.current_device(), dtype=torch.bfloat16)
|
| 168 |
+
|
| 169 |
+
# Generate prediction
|
| 170 |
+
with torch.no_grad(), autocast(dtype=torch.bfloat16):
|
| 171 |
+
preds = self.birefnet(input_image)[-1].sigmoid().cpu()
|
| 172 |
+
|
| 173 |
+
# Process the mask
|
| 174 |
+
pred = preds[0].squeeze().float()
|
| 175 |
+
pred_pil = transforms.ToPILImage()(pred)
|
| 176 |
+
mask = pred_pil.resize(image.size)
|
| 177 |
+
binary_mask = mask.convert("L")
|
| 178 |
+
|
| 179 |
+
# Create a new image with black background
|
| 180 |
+
result = Image.new("RGB", image.size, (0, 0, 0))
|
| 181 |
+
result.paste(image, (0, 0), binary_mask)
|
| 182 |
+
|
| 183 |
+
|
| 184 |
+
return result
|
| 185 |
+
|
| 186 |
+
|
| 187 |
+
def get_id_embedding(self, refimage):
|
| 188 |
+
'''
|
| 189 |
+
refimage is a list (batch) of lists (one per person) of PIL images;
|
| 190 |
+
across the whole batch, the number of persons is fixed
|
| 191 |
+
'''
|
| 192 |
+
siglip_embedding = []
|
| 193 |
+
|
| 194 |
+
|
| 195 |
+
if isinstance(refimage, list):
|
| 196 |
+
batch_size = len(refimage)
|
| 197 |
+
for batch_idx, refimage_batch in enumerate(refimage):
|
| 198 |
+
# Apply matting if enabled
|
| 199 |
+
if self.use_matting:
|
| 200 |
+
|
| 201 |
+
processed_images = [self.apply_matting(img) for img in refimage_batch]
|
| 202 |
+
else:
|
| 203 |
+
processed_images = refimage_batch
|
| 204 |
+
|
| 205 |
+
pixel_values = self.processor(images=processed_images, return_tensors="pt").pixel_values
|
| 206 |
+
# device
|
| 207 |
+
pixel_values = pixel_values.to(torch.cuda.current_device(), dtype=torch.bfloat16)
|
| 208 |
+
last_hidden_state = self.model(pixel_values).last_hidden_state # 2, 256 768
|
| 209 |
+
# pooled_output = self.model(pixel_values).pooler_output # 2, 768
|
| 210 |
+
siglip_embedding.append(last_hidden_state)
|
| 211 |
+
# siglip_embedding.append(pooled_output) # 2, 768
|
| 212 |
+
siglip_embedding = torch.stack(siglip_embedding, dim=0) # shape ([batch_size, num_of_person, 256, 768])
|
| 213 |
+
|
| 214 |
+
if batch_size < 4:
|
| 215 |
+
# run a few extra forward passes to amortize the first-time CUDA memory allocation overhead
|
| 216 |
+
for _ in range(4 - batch_size):
|
| 217 |
+
pixel_values = self.processor(images=processed_images, return_tensors="pt").pixel_values
|
| 218 |
+
# device
|
| 219 |
+
pixel_values = pixel_values.to(torch.cuda.current_device(), dtype=torch.bfloat16)
|
| 220 |
+
last_hidden_state = self.model(pixel_values).last_hidden_state
|
| 221 |
+
|
| 222 |
+
elif isinstance(refimage, torch.Tensor):
|
| 223 |
+
# refimage is a tensor of shape (batch_size, num_of_person, 3, H, W)
|
| 224 |
+
batch_size, num_of_person, C, H, W = refimage.shape
|
| 225 |
+
refimage = refimage.view(batch_size * num_of_person, C, H, W)
|
| 226 |
+
refimage = refimage.to(torch.cuda.current_device(), dtype=torch.bfloat16)
|
| 227 |
+
last_hidden_state = self.model(refimage).last_hidden_state
|
| 228 |
+
siglip_embedding = last_hidden_state.view(batch_size, num_of_person, 256, 768)
|
| 229 |
+
|
| 230 |
+
return siglip_embedding
|
| 231 |
+
|
| 232 |
+
def forward(self, refimage):
|
| 233 |
+
return self.get_id_embedding(refimage)
|
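A usage sketch for SiglipEmbedding, assuming a CUDA device is available, the SigLIP checkpoint can be downloaded, and using hypothetical image files:

```python
import torch
from PIL import Image

embedder = SiglipEmbedding()                           # the class defined above
refs = [[Image.open("person_a.jpg").convert("RGB"),
         Image.open("person_b.jpg").convert("RGB")]]   # 1 sample with 2 reference faces

with torch.no_grad():
    emb = embedder(refs)                               # forward() -> get_id_embedding()
print(emb.shape)                                       # torch.Size([1, 2, 256, 768]): (batch, persons, tokens, hidden)
```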
| 234 |
+
|
| 235 |
+
class Flux(nn.Module):
|
| 236 |
+
"""
|
| 237 |
+
Transformer model for flow matching on sequences.
|
| 238 |
+
"""
|
| 239 |
+
_supports_gradient_checkpointing = True
|
| 240 |
+
|
| 241 |
+
def __init__(self, params: FluxParams):
|
| 242 |
+
super().__init__()
|
| 243 |
+
|
| 244 |
+
self.params = params
|
| 245 |
+
self.in_channels = params.in_channels
|
| 246 |
+
self.out_channels = self.in_channels
|
| 247 |
+
if params.hidden_size % params.num_heads != 0:
|
| 248 |
+
raise ValueError(
|
| 249 |
+
f"Hidden size {params.hidden_size} must be divisible by num_heads {params.num_heads}"
|
| 250 |
+
)
|
| 251 |
+
pe_dim = params.hidden_size // params.num_heads
|
| 252 |
+
if sum(params.axes_dim) != pe_dim:
|
| 253 |
+
raise ValueError(f"Got {params.axes_dim} but expected positional dim {pe_dim}")
|
| 254 |
+
self.hidden_size = params.hidden_size
|
| 255 |
+
self.num_heads = params.num_heads
|
| 256 |
+
self.pe_embedder = EmbedND(dim=pe_dim, theta=params.theta, axes_dim=params.axes_dim)
|
| 257 |
+
self.img_in = nn.Linear(self.in_channels, self.hidden_size, bias=True)
|
| 258 |
+
self.time_in = MLPEmbedder(in_dim=256, hidden_dim=self.hidden_size)
|
| 259 |
+
self.vector_in = MLPEmbedder(params.vec_in_dim, self.hidden_size)
|
| 260 |
+
self.guidance_in = (
|
| 261 |
+
MLPEmbedder(in_dim=256, hidden_dim=self.hidden_size) if params.guidance_embed else nn.Identity()
|
| 262 |
+
)
|
| 263 |
+
self.txt_in = nn.Linear(params.context_in_dim, self.hidden_size)
|
| 264 |
+
|
| 265 |
+
self.double_blocks = nn.ModuleList(
|
| 266 |
+
[
|
| 267 |
+
DoubleStreamBlock(
|
| 268 |
+
self.hidden_size,
|
| 269 |
+
self.num_heads,
|
| 270 |
+
mlp_ratio=params.mlp_ratio,
|
| 271 |
+
qkv_bias=params.qkv_bias,
|
| 272 |
+
)
|
| 273 |
+
for _ in range(params.depth)
|
| 274 |
+
]
|
| 275 |
+
)
|
| 276 |
+
|
| 277 |
+
self.single_blocks = nn.ModuleList(
|
| 278 |
+
[
|
| 279 |
+
SingleStreamBlock(self.hidden_size, self.num_heads, mlp_ratio=params.mlp_ratio)
|
| 280 |
+
for _ in range(params.depth_single_blocks)
|
| 281 |
+
]
|
| 282 |
+
)
|
| 283 |
+
|
| 284 |
+
self.final_layer = LastLayer(self.hidden_size, 1, self.out_channels)
|
| 285 |
+
self.gradient_checkpointing = False
|
| 286 |
+
|
| 287 |
+
|
| 288 |
+
|
| 289 |
+
|
| 290 |
+
# use cross attention
|
| 291 |
+
self.ipa_arc = nn.ModuleList([
|
| 292 |
+
PerceiverAttentionCA(dim=self.hidden_size, kv_dim=self.hidden_size, heads=self.num_heads)
|
| 293 |
+
for _ in range(self.params.depth_single_blocks + self.params.depth)
|
| 294 |
+
])
|
| 295 |
+
self.ipa_sig = nn.ModuleList([
|
| 296 |
+
PerceiverAttentionCA(dim=self.hidden_size, kv_dim=self.hidden_size, heads=self.num_heads)
|
| 297 |
+
for _ in range(self.params.depth_single_blocks + self.params.depth)
|
| 298 |
+
])
|
| 299 |
+
|
| 300 |
+
|
| 301 |
+
|
| 302 |
+
self.arcface_in_arc = nn.Sequential(
|
| 303 |
+
nn.Linear(512, 4 * self.hidden_size, bias=True),
|
| 304 |
+
nn.GELU(),
|
| 305 |
+
nn.LayerNorm(4 * self.hidden_size),
|
| 306 |
+
nn.Linear(4 * self.hidden_size, 8 * self.hidden_size, bias=True),
|
| 307 |
+
)
|
| 308 |
+
|
| 309 |
+
|
| 310 |
+
self.arcface_in_sig = nn.Sequential(
|
| 311 |
+
nn.Linear(512, 4 * self.hidden_size, bias=True),
|
| 312 |
+
nn.GELU(),
|
| 313 |
+
nn.LayerNorm(4 * self.hidden_size),
|
| 314 |
+
nn.Linear(4 * self.hidden_size, 8 * self.hidden_size, bias=True),
|
| 315 |
+
)
|
| 316 |
+
|
| 317 |
+
self.siglip_in_sig = nn.Sequential(
|
| 318 |
+
nn.Linear(768, self.hidden_size, bias=True),
|
| 319 |
+
nn.GELU(),
|
| 320 |
+
nn.LayerNorm(self.hidden_size),
|
| 321 |
+
nn.Linear(self.hidden_size, self.hidden_size, bias=True),
|
| 322 |
+
)
|
| 323 |
+
|
| 324 |
+
|
| 325 |
+
def lq_in_arc(self, txt_lq, siglip_embeddings, arcface_embeddings):
|
| 326 |
+
"""
|
| 327 |
+
Project the ArcFace embeddings into per-reference ID tokens (the SigLIP embeddings are not used here).
|
| 328 |
+
"""
|
| 329 |
+
|
| 330 |
+
# shape of arcface: (num_refs, bs, 512)
|
| 331 |
+
arcface_embeddings = self.arcface_in_arc(arcface_embeddings)
|
| 332 |
+
# shape after projection: (num_refs, bs, 8*hidden_size)
|
| 333 |
+
# 8*hidden_size -> 8 tokens of hidden_size per reference
|
| 334 |
+
arcface_embeddings = rearrange(arcface_embeddings, 'b n (t d) -> b n t d', t=8, d=self.hidden_size)
|
| 335 |
+
# (num_refs, bs, tokens, hidden_size) -> (bs, num_refs*tokens, hidden_size)
|
| 336 |
+
|
| 337 |
+
|
| 338 |
+
arcface_embeddings = arcface_embeddings.permute(1, 0, 2, 3) # (n, b, t, d) -> (b, n, t, d)
|
| 339 |
+
|
| 340 |
+
arcface_embeddings = rearrange(arcface_embeddings, 'b n t d -> b (n t) d')
|
| 341 |
+
|
| 342 |
+
|
| 343 |
+
|
| 344 |
+
return arcface_embeddings
|
| 345 |
+
|
| 346 |
+
def lq_in_sig(self, txt_lq, siglip_embeddings, arcface_embeddings):
|
| 347 |
+
"""
|
| 348 |
+
Build the SigLIP-stream tokens: per-token SigLIP projections concatenated with the projected ArcFace tokens.
|
| 349 |
+
"""
|
| 350 |
+
|
| 351 |
+
|
| 352 |
+
# shape of arcface: (num_refs, bs, 512)
|
| 353 |
+
arcface_embeddings = self.arcface_in_sig(arcface_embeddings)
|
| 354 |
+
|
| 355 |
+
arcface_embeddings = rearrange(arcface_embeddings, 'b n (t d) -> b n t d', t=8, d=self.hidden_size)
|
| 356 |
+
# (num_refs, bs, tokens, hidden_size) -> (bs, num_refs*tokens, hidden_size)
|
| 357 |
+
|
| 358 |
+
arcface_embeddings = arcface_embeddings.permute(1, 0, 2, 3) # (n, b, t, d) -> (b, n, t, d)
|
| 359 |
+
|
| 360 |
+
siglip_embeddings = self.siglip_in_sig(siglip_embeddings) # (bs, num_refs, 256, 768) -> (bs, num_refs, 256, hidden_size)
|
| 361 |
+
|
| 362 |
+
# concat in token dimension
|
| 363 |
+
arcface_embeddings = torch.cat((siglip_embeddings, arcface_embeddings), dim=2) # (bs, num_refs, 256, hidden_size) cat (bs, num_refs, 8, hidden_size) -> (bs, num_refs, 264, hidden_size)
|
| 364 |
+
|
| 365 |
+
|
| 366 |
+
arcface_embeddings = rearrange(arcface_embeddings, 'b n t d -> b (n t) d')
|
| 367 |
+
return arcface_embeddings
|
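The token bookkeeping above is easy to verify in isolation. The sketch below mirrors the two projection stacks with toy dimensions (hidden = 64 is a stand-in for params.hidden_size, and one shared arcface projection stands in for the separate arcface_in_arc / arcface_in_sig weights); the (num_refs, bs, 512) ArcFace layout follows the comments in lq_in_arc.

```python
import torch
from einops import rearrange
from torch import nn

hidden, num_refs, bs = 64, 2, 1  # toy sizes; the real model uses params.hidden_size

arcface_in = nn.Sequential(      # mirrors arcface_in_arc / arcface_in_sig
    nn.Linear(512, 4 * hidden), nn.GELU(),
    nn.LayerNorm(4 * hidden), nn.Linear(4 * hidden, 8 * hidden),
)
siglip_in = nn.Sequential(       # mirrors siglip_in_sig
    nn.Linear(768, hidden), nn.GELU(),
    nn.LayerNorm(hidden), nn.Linear(hidden, hidden),
)

arc = torch.randn(num_refs, bs, 512)       # ArcFace embeddings
sig = torch.randn(bs, num_refs, 256, 768)  # SigLIP patch embeddings

# ArcFace stream: 8 ID tokens per reference
id_tokens = rearrange(arcface_in(arc), 'n b (t d) -> b (n t) d', t=8, d=hidden)
# SigLIP stream: 256 SigLIP tokens + 8 ArcFace tokens per reference
sig_stream = torch.cat(
    (siglip_in(sig), rearrange(arcface_in(arc), 'n b (t d) -> b n t d', t=8, d=hidden)),
    dim=2,
)
sig_stream = rearrange(sig_stream, 'b n t d -> b (n t) d')

print(id_tokens.shape)   # torch.Size([1, 16, 64])  = num_refs * 8, matching id_len = 8
print(sig_stream.shape)  # torch.Size([1, 528, 64]) = num_refs * 264, matching siglip_len = 256 + 8
```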
| 368 |
+
|
| 369 |
+
|
| 370 |
+
|
| 371 |
+
def _set_gradient_checkpointing(self, module, value=False):
|
| 372 |
+
if hasattr(module, "gradient_checkpointing"):
|
| 373 |
+
module.gradient_checkpointing = value
|
| 374 |
+
|
| 375 |
+
@property
|
| 376 |
+
def attn_processors(self):
|
| 377 |
+
# set recursively
|
| 378 |
+
processors: dict[str, nn.Module] = {}
|
| 379 |
+
|
| 380 |
+
def fn_recursive_add_processors(name: str, module: torch.nn.Module, processors):
|
| 381 |
+
if hasattr(module, "set_processor"):
|
| 382 |
+
processors[f"{name}.processor"] = module.processor
|
| 383 |
+
|
| 384 |
+
for sub_name, child in module.named_children():
|
| 385 |
+
fn_recursive_add_processors(f"{name}.{sub_name}", child, processors)
|
| 386 |
+
|
| 387 |
+
return processors
|
| 388 |
+
|
| 389 |
+
for name, module in self.named_children():
|
| 390 |
+
fn_recursive_add_processors(name, module, processors)
|
| 391 |
+
|
| 392 |
+
return processors
|
| 393 |
+
|
| 394 |
+
def set_attn_processor(self, processor):
|
| 395 |
+
r"""
|
| 396 |
+
Sets the attention processor to use to compute attention.
|
| 397 |
+
|
| 398 |
+
Parameters:
|
| 399 |
+
processor (`dict` of `AttentionProcessor` or only `AttentionProcessor`):
|
| 400 |
+
The instantiated processor class or a dictionary of processor classes that will be set as the processor
|
| 401 |
+
for **all** `Attention` layers.
|
| 402 |
+
|
| 403 |
+
If `processor` is a dict, the key needs to define the path to the corresponding cross attention
|
| 404 |
+
processor. This is strongly recommended when setting trainable attention processors.
|
| 405 |
+
|
| 406 |
+
"""
|
| 407 |
+
count = len(self.attn_processors.keys())
|
| 408 |
+
|
| 409 |
+
if isinstance(processor, dict) and len(processor) != count:
|
| 410 |
+
raise ValueError(
|
| 411 |
+
f"A dict of processors was passed, but the number of processors {len(processor)} does not match the"
|
| 412 |
+
f" number of attention layers: {count}. Please make sure to pass {count} processor classes."
|
| 413 |
+
)
|
| 414 |
+
|
| 415 |
+
def fn_recursive_attn_processor(name: str, module: torch.nn.Module, processor):
|
| 416 |
+
if hasattr(module, "set_processor"):
|
| 417 |
+
if not isinstance(processor, dict):
|
| 418 |
+
module.set_processor(processor)
|
| 419 |
+
else:
|
| 420 |
+
module.set_processor(processor.pop(f"{name}.processor"))
|
| 421 |
+
|
| 422 |
+
for sub_name, child in module.named_children():
|
| 423 |
+
fn_recursive_attn_processor(f"{name}.{sub_name}", child, processor)
|
| 424 |
+
|
| 425 |
+
for name, module in self.named_children():
|
| 426 |
+
fn_recursive_attn_processor(name, module, processor)
|
| 427 |
+
|
| 428 |
+
|
| 429 |
+
|
| 430 |
+
def forward(
|
| 431 |
+
self,
|
| 432 |
+
img: Tensor,
|
| 433 |
+
img_ids: Tensor,
|
| 434 |
+
txt: Tensor,
|
| 435 |
+
txt_ids: Tensor,
|
| 436 |
+
timesteps: Tensor,
|
| 437 |
+
y: Tensor,
|
| 438 |
+
guidance: Tensor | None = None,
|
| 439 |
+
siglip_embeddings: Tensor | None = None, # (bs, num_refs, 256, 768)
|
| 440 |
+
arcface_embeddings: Tensor | None = None, # (bs, num_refs, 512)
|
| 441 |
+
bbox_lists: list | None = None,  # list (one entry per batch item) of bbox lists; the number of bboxes per item may differ and must align with dim 1 of arcface_embeddings. Replaces the older bbox_A/bbox_B arguments, which are retained only for compatibility.
|
| 442 |
+
use_mask: bool = True,
|
| 443 |
+
id_weight: float = 1.0,
|
| 444 |
+
siglip_weight: float = 1.0,
|
| 445 |
+
siglip_mask = None,
|
| 446 |
+
arc_mask = None,
|
| 447 |
+
|
| 448 |
+
img_height: int = 512,
|
| 449 |
+
img_width: int = 512,
|
| 450 |
+
) -> Tensor:
|
| 451 |
+
if img.ndim != 3 or txt.ndim != 3:
|
| 452 |
+
raise ValueError("Input img and txt tensors must have 3 dimensions.")
|
| 453 |
+
|
| 454 |
+
# running on sequences img
|
| 455 |
+
img = self.img_in(img)
|
| 456 |
+
vec = self.time_in(timestep_embedding(timesteps, 256))
|
| 457 |
+
if self.params.guidance_embed:
|
| 458 |
+
if guidance is None:
|
| 459 |
+
raise ValueError("Didn't get guidance strength for guidance distilled model.")
|
| 460 |
+
vec = vec + self.guidance_in(timestep_embedding(guidance, 256))
|
| 461 |
+
vec = vec + self.vector_in(y)
|
| 462 |
+
txt = self.txt_in(txt)
|
| 463 |
+
|
| 464 |
+
|
| 465 |
+
|
| 466 |
+
|
| 467 |
+
text_length = txt.shape[1]
|
| 468 |
+
img_length = img.shape[1]
|
| 469 |
+
|
| 470 |
+
img_end = img.shape[1]
|
| 471 |
+
|
| 472 |
+
|
| 473 |
+
use_ip = arcface_embeddings is not None
|
| 474 |
+
|
| 475 |
+
if use_ip:
|
| 476 |
+
|
| 477 |
+
id_embeddings = self.lq_in_arc(None, siglip_embeddings, arcface_embeddings)
|
| 478 |
+
siglip_embeddings = self.lq_in_sig(None, siglip_embeddings, arcface_embeddings)
|
| 479 |
+
|
| 480 |
+
text_length = txt.shape[1]  # re-read text_length (no extra tokens are prepended to txt here)
|
| 481 |
+
|
| 482 |
+
|
| 483 |
+
# 8 ID tokens per reference for the ArcFace stream; 256 SigLIP tokens + 8 ArcFace tokens per reference for the SigLIP stream
|
| 484 |
+
id_len = 8
|
| 485 |
+
siglip_len = 256 + 8
|
| 486 |
+
|
| 487 |
+
|
| 488 |
+
|
| 489 |
+
if bbox_lists is not None and use_mask and (arc_mask is None or siglip_mask is None):
|
| 490 |
+
arc_mask = create_person_cross_attention_mask_varlen(
|
| 491 |
+
batch_size=img.shape[0],
|
| 492 |
+
num_heads=self.params.num_heads,
|
| 493 |
+
# txt_len=text_length,
|
| 494 |
+
img_len=img_length,
|
| 495 |
+
id_len=id_len,
|
| 496 |
+
bbox_lists=bbox_lists,
|
| 497 |
+
max_num_ids=len(bbox_lists[0]),
|
| 498 |
+
original_width=img_width,
|
| 499 |
+
original_height=img_height,
|
| 500 |
+
).to(img.device)
|
| 501 |
+
siglip_mask = create_person_cross_attention_mask_varlen(
|
| 502 |
+
batch_size=img.shape[0],
|
| 503 |
+
num_heads=self.params.num_heads,
|
| 504 |
+
# txt_len=text_length,
|
| 505 |
+
img_len=img_length,
|
| 506 |
+
id_len=siglip_len,
|
| 507 |
+
bbox_lists=bbox_lists,
|
| 508 |
+
max_num_ids=len(bbox_lists[0]),
|
| 509 |
+
original_width=img_width,
|
| 510 |
+
original_height=img_height,
|
| 511 |
+
).to(img.device)
|
| 512 |
+
else:
|
| 513 |
+
arc_mask = None
|
| 514 |
+
siglip_mask = None
|
| 515 |
+
|
| 516 |
+
|
| 517 |
+
|
| 518 |
+
# update text_ids and id_ids
|
| 519 |
+
txt_ids = torch.zeros((txt.shape[0], text_length, 3)).to(img_ids.device) # (bs, T, 3)
|
| 520 |
+
|
| 521 |
+
ids = torch.cat((txt_ids, img_ids), dim=1) # (bs, T + I, 3)
|
| 522 |
+
|
| 523 |
+
|
| 524 |
+
pe = self.pe_embedder(ids)
|
| 525 |
+
|
| 526 |
+
# ipa
|
| 527 |
+
ipa_idx = 0
|
| 528 |
+
|
| 529 |
+
for index_block, block in enumerate(self.double_blocks):
|
| 530 |
+
if self.training and self.gradient_checkpointing:
|
| 531 |
+
img, txt = torch.utils.checkpoint.checkpoint(
|
| 532 |
+
block,
|
| 533 |
+
img=img,
|
| 534 |
+
txt=txt,
|
| 535 |
+
vec=vec,
|
| 536 |
+
pe=pe,
|
| 537 |
+
# mask=mask,
|
| 538 |
+
text_length=text_length,
|
| 539 |
+
image_length=img_length,
|
| 540 |
+
# return_map = False,
|
| 541 |
+
use_reentrant=False,
|
| 542 |
+
)
|
| 543 |
+
|
| 544 |
+
|
| 545 |
+
|
| 546 |
+
else:
|
| 547 |
+
img, txt = block(
|
| 548 |
+
img=img,
|
| 549 |
+
txt=txt,
|
| 550 |
+
vec=vec,
|
| 551 |
+
pe=pe,
|
| 552 |
+
text_length=text_length,
|
| 553 |
+
image_length=img_length,
|
| 554 |
+
# return_map=False,
|
| 555 |
+
)
|
| 556 |
+
|
| 557 |
+
|
| 558 |
+
if use_ip:
|
| 559 |
+
|
| 560 |
+
img = img + id_weight * self.ipa_arc[ipa_idx](id_embeddings, img, mask=arc_mask) + siglip_weight * self.ipa_sig[ipa_idx](siglip_embeddings, img, mask=siglip_mask)
|
| 561 |
+
ipa_idx += 1
|
| 562 |
+
|
| 563 |
+
|
| 564 |
+
|
| 565 |
+
|
| 566 |
+
|
| 567 |
+
|
| 568 |
+
|
| 569 |
+
# for block in self.single_blocks:
|
| 570 |
+
img = torch.cat((txt, img), 1)
|
| 571 |
+
|
| 572 |
+
|
| 573 |
+
for index_block, block in enumerate(self.single_blocks):
|
| 574 |
+
if self.training and self.gradient_checkpointing:
|
| 575 |
+
img = torch.utils.checkpoint.checkpoint(
|
| 576 |
+
block,
|
| 577 |
+
img, vec=vec, pe=pe, #mask=mask,
|
| 578 |
+
text_length=text_length,
|
| 579 |
+
image_length=img_length,
|
| 580 |
+
return_map=False,
|
| 581 |
+
use_reentrant=False
|
| 582 |
+
)
|
| 583 |
+
|
| 584 |
+
else:
|
| 585 |
+
img = block(img, vec=vec, pe=pe, text_length=text_length, image_length=img_length, return_map=False)
|
| 586 |
+
|
| 587 |
+
|
| 588 |
+
|
| 589 |
+
|
| 590 |
+
# IPA
|
| 591 |
+
if use_ip:
|
| 592 |
+
txt, real_img = img[:, :text_length, :], img[:, text_length:, :]
|
| 593 |
+
|
| 594 |
+
id_ca = id_weight * self.ipa_arc[ipa_idx](id_embeddings, real_img, mask=arc_mask) + siglip_weight * self.ipa_sig[ipa_idx](siglip_embeddings, real_img, mask=siglip_mask)
|
| 595 |
+
|
| 596 |
+
real_img = real_img + id_ca
|
| 597 |
+
img = torch.cat((txt, real_img), dim=1)
|
| 598 |
+
ipa_idx += 1
|
| 599 |
+
|
| 600 |
+
|
| 601 |
+
|
| 602 |
+
|
| 603 |
+
|
| 604 |
+
img = img[:, txt.shape[1] :, ...]
|
| 605 |
+
# index img
|
| 606 |
+
img = img[:, :img_end, ...]
|
| 607 |
+
|
| 608 |
+
img = self.final_layer(img, vec) # (N, T, patch_size ** 2 * out_channels)
|
| 609 |
+
|
| 610 |
+
return img
|
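The forward pass injects identity information after every double and single block through the same residual cross-attention update. The snippet below isolates that step with PerceiverAttentionCA and toy sizes (64-dim tokens, 4 heads, 1024 image tokens, two references); the real model uses hidden_size-wide tokens and num_heads heads, and applies the bbox-derived masks instead of mask=None.

```python
import torch
from withanyone.flux.modules.layers import PerceiverAttentionCA

dim, heads, bs = 64, 4, 1                 # toy sizes
ipa_arc = PerceiverAttentionCA(dim=dim, kv_dim=dim, heads=heads)
ipa_sig = PerceiverAttentionCA(dim=dim, kv_dim=dim, heads=heads)

img = torch.randn(bs, 1024, dim)          # image tokens (attention queries)
id_tokens = torch.randn(bs, 16, dim)      # 2 refs x 8 ArcFace ID tokens (keys/values)
sig_tokens = torch.randn(bs, 528, dim)    # 2 refs x (256 + 8) SigLIP-stream tokens

id_weight, siglip_weight = 1.0, 1.0
# same residual update as in Flux.forward(); mask=None disables per-person bbox masking
img = img + id_weight * ipa_arc(id_tokens, img, mask=None) \
          + siglip_weight * ipa_sig(sig_tokens, img, mask=None)
print(img.shape)  # torch.Size([1, 1024, 64])
```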
withanyone/flux/modules/__pycache__/autoencoder.cpython-310.pyc
ADDED
|
Binary file (9.09 kB).
|
|
|
withanyone/flux/modules/__pycache__/conditioner.cpython-310.pyc
ADDED
|
Binary file (1.52 kB).
|
|
|
withanyone/flux/modules/__pycache__/layers.cpython-310.pyc
ADDED
|
Binary file (18 kB).
|
|
|
withanyone/flux/modules/autoencoder.py
ADDED
|
@@ -0,0 +1,327 @@
|
| 1 |
+
# Copyright (c) 2025 Bytedance Ltd. and/or its affiliates. All rights reserved.
|
| 2 |
+
# Copyright (c) 2024 Black Forest Labs and The XLabs-AI Team. All rights reserved.
|
| 3 |
+
|
| 4 |
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
| 5 |
+
# you may not use this file except in compliance with the License.
|
| 6 |
+
# You may obtain a copy of the License at
|
| 7 |
+
|
| 8 |
+
# http://www.apache.org/licenses/LICENSE-2.0
|
| 9 |
+
|
| 10 |
+
# Unless required by applicable law or agreed to in writing, software
|
| 11 |
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
| 12 |
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
| 13 |
+
# See the License for the specific language governing permissions and
|
| 14 |
+
# limitations under the License.
|
| 15 |
+
|
| 16 |
+
from dataclasses import dataclass
|
| 17 |
+
|
| 18 |
+
import torch
|
| 19 |
+
from einops import rearrange
|
| 20 |
+
from torch import Tensor, nn
|
| 21 |
+
|
| 22 |
+
|
| 23 |
+
@dataclass
|
| 24 |
+
class AutoEncoderParams:
|
| 25 |
+
resolution: int
|
| 26 |
+
in_channels: int
|
| 27 |
+
ch: int
|
| 28 |
+
out_ch: int
|
| 29 |
+
ch_mult: list[int]
|
| 30 |
+
num_res_blocks: int
|
| 31 |
+
z_channels: int
|
| 32 |
+
scale_factor: float
|
| 33 |
+
shift_factor: float
|
| 34 |
+
|
| 35 |
+
|
| 36 |
+
def swish(x: Tensor) -> Tensor:
|
| 37 |
+
return x * torch.sigmoid(x)
|
| 38 |
+
|
| 39 |
+
|
| 40 |
+
class AttnBlock(nn.Module):
|
| 41 |
+
def __init__(self, in_channels: int):
|
| 42 |
+
super().__init__()
|
| 43 |
+
self.in_channels = in_channels
|
| 44 |
+
|
| 45 |
+
self.norm = nn.GroupNorm(num_groups=32, num_channels=in_channels, eps=1e-6, affine=True)
|
| 46 |
+
|
| 47 |
+
self.q = nn.Conv2d(in_channels, in_channels, kernel_size=1)
|
| 48 |
+
self.k = nn.Conv2d(in_channels, in_channels, kernel_size=1)
|
| 49 |
+
self.v = nn.Conv2d(in_channels, in_channels, kernel_size=1)
|
| 50 |
+
self.proj_out = nn.Conv2d(in_channels, in_channels, kernel_size=1)
|
| 51 |
+
|
| 52 |
+
def attention(self, h_: Tensor) -> Tensor:
|
| 53 |
+
h_ = self.norm(h_)
|
| 54 |
+
q = self.q(h_)
|
| 55 |
+
k = self.k(h_)
|
| 56 |
+
v = self.v(h_)
|
| 57 |
+
|
| 58 |
+
b, c, h, w = q.shape
|
| 59 |
+
q = rearrange(q, "b c h w -> b 1 (h w) c").contiguous()
|
| 60 |
+
k = rearrange(k, "b c h w -> b 1 (h w) c").contiguous()
|
| 61 |
+
v = rearrange(v, "b c h w -> b 1 (h w) c").contiguous()
|
| 62 |
+
h_ = nn.functional.scaled_dot_product_attention(q, k, v)
|
| 63 |
+
|
| 64 |
+
return rearrange(h_, "b 1 (h w) c -> b c h w", h=h, w=w, c=c, b=b)
|
| 65 |
+
|
| 66 |
+
def forward(self, x: Tensor) -> Tensor:
|
| 67 |
+
return x + self.proj_out(self.attention(x))
|
| 68 |
+
|
| 69 |
+
|
| 70 |
+
class ResnetBlock(nn.Module):
|
| 71 |
+
def __init__(self, in_channels: int, out_channels: int | None):
|
| 72 |
+
super().__init__()
|
| 73 |
+
self.in_channels = in_channels
|
| 74 |
+
out_channels = in_channels if out_channels is None else out_channels
|
| 75 |
+
self.out_channels = out_channels
|
| 76 |
+
|
| 77 |
+
self.norm1 = nn.GroupNorm(num_groups=32, num_channels=in_channels, eps=1e-6, affine=True)
|
| 78 |
+
self.conv1 = nn.Conv2d(in_channels, out_channels, kernel_size=3, stride=1, padding=1)
|
| 79 |
+
self.norm2 = nn.GroupNorm(num_groups=32, num_channels=out_channels, eps=1e-6, affine=True)
|
| 80 |
+
self.conv2 = nn.Conv2d(out_channels, out_channels, kernel_size=3, stride=1, padding=1)
|
| 81 |
+
if self.in_channels != self.out_channels:
|
| 82 |
+
self.nin_shortcut = nn.Conv2d(in_channels, out_channels, kernel_size=1, stride=1, padding=0)
|
| 83 |
+
|
| 84 |
+
def forward(self, x):
|
| 85 |
+
h = x
|
| 86 |
+
h = self.norm1(h)
|
| 87 |
+
h = swish(h)
|
| 88 |
+
h = self.conv1(h)
|
| 89 |
+
|
| 90 |
+
h = self.norm2(h)
|
| 91 |
+
h = swish(h)
|
| 92 |
+
h = self.conv2(h)
|
| 93 |
+
|
| 94 |
+
if self.in_channels != self.out_channels:
|
| 95 |
+
x = self.nin_shortcut(x)
|
| 96 |
+
|
| 97 |
+
return x + h
|
| 98 |
+
|
| 99 |
+
|
| 100 |
+
class Downsample(nn.Module):
|
| 101 |
+
def __init__(self, in_channels: int):
|
| 102 |
+
super().__init__()
|
| 103 |
+
# no asymmetric padding in torch conv, must do it ourselves
|
| 104 |
+
self.conv = nn.Conv2d(in_channels, in_channels, kernel_size=3, stride=2, padding=0)
|
| 105 |
+
|
| 106 |
+
def forward(self, x: Tensor):
|
| 107 |
+
pad = (0, 1, 0, 1)
|
| 108 |
+
x = nn.functional.pad(x, pad, mode="constant", value=0)
|
| 109 |
+
x = self.conv(x)
|
| 110 |
+
return x
|
| 111 |
+
|
| 112 |
+
|
| 113 |
+
class Upsample(nn.Module):
|
| 114 |
+
def __init__(self, in_channels: int):
|
| 115 |
+
super().__init__()
|
| 116 |
+
self.conv = nn.Conv2d(in_channels, in_channels, kernel_size=3, stride=1, padding=1)
|
| 117 |
+
|
| 118 |
+
def forward(self, x: Tensor):
|
| 119 |
+
x = nn.functional.interpolate(x, scale_factor=2.0, mode="nearest")
|
| 120 |
+
x = self.conv(x)
|
| 121 |
+
return x
|
| 122 |
+
|
| 123 |
+
|
| 124 |
+
class Encoder(nn.Module):
|
| 125 |
+
def __init__(
|
| 126 |
+
self,
|
| 127 |
+
resolution: int,
|
| 128 |
+
in_channels: int,
|
| 129 |
+
ch: int,
|
| 130 |
+
ch_mult: list[int],
|
| 131 |
+
num_res_blocks: int,
|
| 132 |
+
z_channels: int,
|
| 133 |
+
):
|
| 134 |
+
super().__init__()
|
| 135 |
+
self.ch = ch
|
| 136 |
+
self.num_resolutions = len(ch_mult)
|
| 137 |
+
self.num_res_blocks = num_res_blocks
|
| 138 |
+
self.resolution = resolution
|
| 139 |
+
self.in_channels = in_channels
|
| 140 |
+
# downsampling
|
| 141 |
+
self.conv_in = nn.Conv2d(in_channels, self.ch, kernel_size=3, stride=1, padding=1)
|
| 142 |
+
|
| 143 |
+
curr_res = resolution
|
| 144 |
+
in_ch_mult = (1,) + tuple(ch_mult)
|
| 145 |
+
self.in_ch_mult = in_ch_mult
|
| 146 |
+
self.down = nn.ModuleList()
|
| 147 |
+
block_in = self.ch
|
| 148 |
+
for i_level in range(self.num_resolutions):
|
| 149 |
+
block = nn.ModuleList()
|
| 150 |
+
attn = nn.ModuleList()
|
| 151 |
+
block_in = ch * in_ch_mult[i_level]
|
| 152 |
+
block_out = ch * ch_mult[i_level]
|
| 153 |
+
for _ in range(self.num_res_blocks):
|
| 154 |
+
block.append(ResnetBlock(in_channels=block_in, out_channels=block_out))
|
| 155 |
+
block_in = block_out
|
| 156 |
+
down = nn.Module()
|
| 157 |
+
down.block = block
|
| 158 |
+
down.attn = attn
|
| 159 |
+
if i_level != self.num_resolutions - 1:
|
| 160 |
+
down.downsample = Downsample(block_in)
|
| 161 |
+
curr_res = curr_res // 2
|
| 162 |
+
self.down.append(down)
|
| 163 |
+
|
| 164 |
+
# middle
|
| 165 |
+
self.mid = nn.Module()
|
| 166 |
+
self.mid.block_1 = ResnetBlock(in_channels=block_in, out_channels=block_in)
|
| 167 |
+
self.mid.attn_1 = AttnBlock(block_in)
|
| 168 |
+
self.mid.block_2 = ResnetBlock(in_channels=block_in, out_channels=block_in)
|
| 169 |
+
|
| 170 |
+
# end
|
| 171 |
+
self.norm_out = nn.GroupNorm(num_groups=32, num_channels=block_in, eps=1e-6, affine=True)
|
| 172 |
+
self.conv_out = nn.Conv2d(block_in, 2 * z_channels, kernel_size=3, stride=1, padding=1)
|
| 173 |
+
|
| 174 |
+
def forward(self, x: Tensor) -> Tensor:
|
| 175 |
+
# downsampling
|
| 176 |
+
hs = [self.conv_in(x)]
|
| 177 |
+
for i_level in range(self.num_resolutions):
|
| 178 |
+
for i_block in range(self.num_res_blocks):
|
| 179 |
+
h = self.down[i_level].block[i_block](hs[-1])
|
| 180 |
+
if len(self.down[i_level].attn) > 0:
|
| 181 |
+
h = self.down[i_level].attn[i_block](h)
|
| 182 |
+
hs.append(h)
|
| 183 |
+
if i_level != self.num_resolutions - 1:
|
| 184 |
+
hs.append(self.down[i_level].downsample(hs[-1]))
|
| 185 |
+
|
| 186 |
+
# middle
|
| 187 |
+
h = hs[-1]
|
| 188 |
+
h = self.mid.block_1(h)
|
| 189 |
+
h = self.mid.attn_1(h)
|
| 190 |
+
h = self.mid.block_2(h)
|
| 191 |
+
# end
|
| 192 |
+
h = self.norm_out(h)
|
| 193 |
+
h = swish(h)
|
| 194 |
+
h = self.conv_out(h)
|
| 195 |
+
return h
|
| 196 |
+
|
| 197 |
+
|
| 198 |
+
class Decoder(nn.Module):
|
| 199 |
+
def __init__(
|
| 200 |
+
self,
|
| 201 |
+
ch: int,
|
| 202 |
+
out_ch: int,
|
| 203 |
+
ch_mult: list[int],
|
| 204 |
+
num_res_blocks: int,
|
| 205 |
+
in_channels: int,
|
| 206 |
+
resolution: int,
|
| 207 |
+
z_channels: int,
|
| 208 |
+
):
|
| 209 |
+
super().__init__()
|
| 210 |
+
self.ch = ch
|
| 211 |
+
self.num_resolutions = len(ch_mult)
|
| 212 |
+
self.num_res_blocks = num_res_blocks
|
| 213 |
+
self.resolution = resolution
|
| 214 |
+
self.in_channels = in_channels
|
| 215 |
+
self.ffactor = 2 ** (self.num_resolutions - 1)
|
| 216 |
+
|
| 217 |
+
# compute in_ch_mult, block_in and curr_res at lowest res
|
| 218 |
+
block_in = ch * ch_mult[self.num_resolutions - 1]
|
| 219 |
+
curr_res = resolution // 2 ** (self.num_resolutions - 1)
|
| 220 |
+
self.z_shape = (1, z_channels, curr_res, curr_res)
|
| 221 |
+
|
| 222 |
+
# z to block_in
|
| 223 |
+
self.conv_in = nn.Conv2d(z_channels, block_in, kernel_size=3, stride=1, padding=1)
|
| 224 |
+
|
| 225 |
+
# middle
|
| 226 |
+
self.mid = nn.Module()
|
| 227 |
+
self.mid.block_1 = ResnetBlock(in_channels=block_in, out_channels=block_in)
|
| 228 |
+
self.mid.attn_1 = AttnBlock(block_in)
|
| 229 |
+
self.mid.block_2 = ResnetBlock(in_channels=block_in, out_channels=block_in)
|
| 230 |
+
|
| 231 |
+
# upsampling
|
| 232 |
+
self.up = nn.ModuleList()
|
| 233 |
+
for i_level in reversed(range(self.num_resolutions)):
|
| 234 |
+
block = nn.ModuleList()
|
| 235 |
+
attn = nn.ModuleList()
|
| 236 |
+
block_out = ch * ch_mult[i_level]
|
| 237 |
+
for _ in range(self.num_res_blocks + 1):
|
| 238 |
+
block.append(ResnetBlock(in_channels=block_in, out_channels=block_out))
|
| 239 |
+
block_in = block_out
|
| 240 |
+
up = nn.Module()
|
| 241 |
+
up.block = block
|
| 242 |
+
up.attn = attn
|
| 243 |
+
if i_level != 0:
|
| 244 |
+
up.upsample = Upsample(block_in)
|
| 245 |
+
curr_res = curr_res * 2
|
| 246 |
+
self.up.insert(0, up) # prepend to get consistent order
|
| 247 |
+
|
| 248 |
+
# end
|
| 249 |
+
self.norm_out = nn.GroupNorm(num_groups=32, num_channels=block_in, eps=1e-6, affine=True)
|
| 250 |
+
self.conv_out = nn.Conv2d(block_in, out_ch, kernel_size=3, stride=1, padding=1)
|
| 251 |
+
|
| 252 |
+
def forward(self, z: Tensor) -> Tensor:
|
| 253 |
+
# z to block_in
|
| 254 |
+
h = self.conv_in(z)
|
| 255 |
+
|
| 256 |
+
# middle
|
| 257 |
+
h = self.mid.block_1(h)
|
| 258 |
+
h = self.mid.attn_1(h)
|
| 259 |
+
h = self.mid.block_2(h)
|
| 260 |
+
|
| 261 |
+
# upsampling
|
| 262 |
+
for i_level in reversed(range(self.num_resolutions)):
|
| 263 |
+
for i_block in range(self.num_res_blocks + 1):
|
| 264 |
+
h = self.up[i_level].block[i_block](h)
|
| 265 |
+
if len(self.up[i_level].attn) > 0:
|
| 266 |
+
h = self.up[i_level].attn[i_block](h)
|
| 267 |
+
if i_level != 0:
|
| 268 |
+
h = self.up[i_level].upsample(h)
|
| 269 |
+
|
| 270 |
+
# end
|
| 271 |
+
h = self.norm_out(h)
|
| 272 |
+
h = swish(h)
|
| 273 |
+
h = self.conv_out(h)
|
| 274 |
+
return h
|
| 275 |
+
|
| 276 |
+
|
| 277 |
+
class DiagonalGaussian(nn.Module):
|
| 278 |
+
def __init__(self, sample: bool = True, chunk_dim: int = 1):
|
| 279 |
+
super().__init__()
|
| 280 |
+
self.sample = sample
|
| 281 |
+
self.chunk_dim = chunk_dim
|
| 282 |
+
|
| 283 |
+
def forward(self, z: Tensor) -> Tensor:
|
| 284 |
+
mean, logvar = torch.chunk(z, 2, dim=self.chunk_dim)
|
| 285 |
+
if self.sample:
|
| 286 |
+
std = torch.exp(0.5 * logvar)
|
| 287 |
+
return mean + std * torch.randn_like(mean)
|
| 288 |
+
else:
|
| 289 |
+
return mean
|
| 290 |
+
|
| 291 |
+
|
| 292 |
+
class AutoEncoder(nn.Module):
|
| 293 |
+
def __init__(self, params: AutoEncoderParams):
|
| 294 |
+
super().__init__()
|
| 295 |
+
self.encoder = Encoder(
|
| 296 |
+
resolution=params.resolution,
|
| 297 |
+
in_channels=params.in_channels,
|
| 298 |
+
ch=params.ch,
|
| 299 |
+
ch_mult=params.ch_mult,
|
| 300 |
+
num_res_blocks=params.num_res_blocks,
|
| 301 |
+
z_channels=params.z_channels,
|
| 302 |
+
)
|
| 303 |
+
self.decoder = Decoder(
|
| 304 |
+
resolution=params.resolution,
|
| 305 |
+
in_channels=params.in_channels,
|
| 306 |
+
ch=params.ch,
|
| 307 |
+
out_ch=params.out_ch,
|
| 308 |
+
ch_mult=params.ch_mult,
|
| 309 |
+
num_res_blocks=params.num_res_blocks,
|
| 310 |
+
z_channels=params.z_channels,
|
| 311 |
+
)
|
| 312 |
+
self.reg = DiagonalGaussian()
|
| 313 |
+
|
| 314 |
+
self.scale_factor = params.scale_factor
|
| 315 |
+
self.shift_factor = params.shift_factor
|
| 316 |
+
|
| 317 |
+
def encode(self, x: Tensor) -> Tensor:
|
| 318 |
+
z = self.reg(self.encoder(x))
|
| 319 |
+
z = self.scale_factor * (z - self.shift_factor)
|
| 320 |
+
return z
|
| 321 |
+
|
| 322 |
+
def decode(self, z: Tensor) -> Tensor:
|
| 323 |
+
z = z / self.scale_factor + self.shift_factor
|
| 324 |
+
return self.decoder(z)
|
| 325 |
+
|
| 326 |
+
def forward(self, x: Tensor) -> Tensor:
|
| 327 |
+
return self.decode(self.encode(x))
|
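As a quick shape check, the AutoEncoder can be exercised end to end with a small toy configuration. The scale_factor / shift_factor below are placeholders rather than a released checkpoint's calibrated constants, and ch / ch_mult are deliberately tiny.

```python
import torch
from withanyone.flux.modules.autoencoder import AutoEncoder, AutoEncoderParams

params = AutoEncoderParams(
    resolution=64, in_channels=3, ch=32, out_ch=3,
    ch_mult=[1, 2], num_res_blocks=1, z_channels=4,
    scale_factor=0.3611, shift_factor=0.1159,  # placeholder values
)
ae = AutoEncoder(params).eval()

x = torch.randn(1, 3, 64, 64)
with torch.no_grad():
    z = ae.encode(x)      # (1, 4, 32, 32): one downsample for the two-level ch_mult
    recon = ae.decode(z)  # (1, 3, 64, 64)
print(z.shape, recon.shape)
```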
withanyone/flux/modules/conditioner.py
ADDED
|
@@ -0,0 +1,53 @@
|
| 1 |
+
# Copyright (c) 2025 Bytedance Ltd. and/or its affiliates. All rights reserved.
|
| 2 |
+
# Copyright (c) 2024 Black Forest Labs and The XLabs-AI Team. All rights reserved.
|
| 3 |
+
|
| 4 |
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
| 5 |
+
# you may not use this file except in compliance with the License.
|
| 6 |
+
# You may obtain a copy of the License at
|
| 7 |
+
|
| 8 |
+
# http://www.apache.org/licenses/LICENSE-2.0
|
| 9 |
+
|
| 10 |
+
# Unless required by applicable law or agreed to in writing, software
|
| 11 |
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
| 12 |
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
| 13 |
+
# See the License for the specific language governing permissions and
|
| 14 |
+
# limitations under the License.
|
| 15 |
+
|
| 16 |
+
from torch import Tensor, nn
|
| 17 |
+
from transformers import (CLIPTextModel, CLIPTokenizer, T5EncoderModel,
|
| 18 |
+
T5Tokenizer)
|
| 19 |
+
|
| 20 |
+
|
| 21 |
+
class HFEmbedder(nn.Module):
|
| 22 |
+
def __init__(self, version: str, max_length: int, **hf_kwargs):
|
| 23 |
+
super().__init__()
|
| 24 |
+
self.is_clip = "clip" in version.lower()
|
| 25 |
+
self.max_length = max_length
|
| 26 |
+
self.output_key = "pooler_output" if self.is_clip else "last_hidden_state"
|
| 27 |
+
|
| 28 |
+
if self.is_clip:
|
| 29 |
+
self.tokenizer: CLIPTokenizer = CLIPTokenizer.from_pretrained(version, max_length=max_length)
|
| 30 |
+
self.hf_module: CLIPTextModel = CLIPTextModel.from_pretrained(version, **hf_kwargs)
|
| 31 |
+
else:
|
| 32 |
+
self.tokenizer: T5Tokenizer = T5Tokenizer.from_pretrained(version, max_length=max_length)
|
| 33 |
+
self.hf_module: T5EncoderModel = T5EncoderModel.from_pretrained(version, **hf_kwargs)
|
| 34 |
+
|
| 35 |
+
self.hf_module = self.hf_module.eval().requires_grad_(False)
|
| 36 |
+
|
| 37 |
+
def forward(self, text: list[str]) -> Tensor:
|
| 38 |
+
batch_encoding = self.tokenizer(
|
| 39 |
+
text,
|
| 40 |
+
truncation=True,
|
| 41 |
+
max_length=self.max_length,
|
| 42 |
+
return_length=False,
|
| 43 |
+
return_overflowing_tokens=False,
|
| 44 |
+
padding="max_length",
|
| 45 |
+
return_tensors="pt",
|
| 46 |
+
)
|
| 47 |
+
|
| 48 |
+
outputs = self.hf_module(
|
| 49 |
+
input_ids=batch_encoding["input_ids"].to(self.hf_module.device),
|
| 50 |
+
attention_mask=None,
|
| 51 |
+
output_hidden_states=False,
|
| 52 |
+
)
|
| 53 |
+
return outputs[self.output_key]
|
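HFEmbedder returns the CLIP pooled vector when "clip" appears in the checkpoint name and the T5 last hidden state otherwise. A usage sketch; the model identifiers are the usual FLUX-style text encoders and are assumptions here (first use downloads the weights, and the T5 path needs sentencepiece):

```python
import torch
from withanyone.flux.modules.conditioner import HFEmbedder

# CLIP branch ("clip" in the name) -> pooler_output, shape (bs, 768)
clip = HFEmbedder("openai/clip-vit-large-patch14", max_length=77, torch_dtype=torch.float32)
print(clip(["a portrait photo of two people"]).shape)   # torch.Size([1, 768])

# T5 branch (anything else) -> last_hidden_state, shape (bs, max_length, d_model)
t5 = HFEmbedder("google/t5-v1_1-small", max_length=128, torch_dtype=torch.float32)
print(t5(["a portrait photo of two people"]).shape)     # torch.Size([1, 128, 512])
```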
withanyone/flux/modules/layers.py
ADDED
|
@@ -0,0 +1,530 @@
|
| 1 |
+
|
| 2 |
+
|
| 3 |
+
import math
|
| 4 |
+
from dataclasses import dataclass
|
| 5 |
+
|
| 6 |
+
import torch
|
| 7 |
+
from einops import rearrange
|
| 8 |
+
from torch import Tensor, nn
|
| 9 |
+
|
| 10 |
+
# from ..math import attention, rope
|
| 11 |
+
from ..math import rope
|
| 12 |
+
from ..math import attention
|
| 13 |
+
# from ..math import attention
|
| 14 |
+
import torch.nn.functional as F
|
| 15 |
+
|
| 16 |
+
TOKEN_AUG_IDX = 2048
|
| 17 |
+
|
| 18 |
+
class EmbedND(nn.Module):
|
| 19 |
+
def __init__(self, dim: int, theta: int, axes_dim: list[int]):
|
| 20 |
+
super().__init__()
|
| 21 |
+
self.dim = dim
|
| 22 |
+
self.theta = theta
|
| 23 |
+
self.axes_dim = axes_dim
|
| 24 |
+
|
| 25 |
+
def forward(self, ids: Tensor) -> Tensor:
|
| 26 |
+
n_axes = ids.shape[-1]
|
| 27 |
+
emb = torch.cat(
|
| 28 |
+
[rope(ids[..., i], self.axes_dim[i], self.theta) for i in range(n_axes)],
|
| 29 |
+
dim=-3,
|
| 30 |
+
)
|
| 31 |
+
|
| 32 |
+
return emb.unsqueeze(1)
|
| 33 |
+
|
| 34 |
+
|
| 35 |
+
def timestep_embedding(t: Tensor, dim, max_period=10000, time_factor: float = 1000.0):
|
| 36 |
+
"""
|
| 37 |
+
Create sinusoidal timestep embeddings.
|
| 38 |
+
:param t: a 1-D Tensor of N indices, one per batch element.
|
| 39 |
+
These may be fractional.
|
| 40 |
+
:param dim: the dimension of the output.
|
| 41 |
+
:param max_period: controls the minimum frequency of the embeddings.
|
| 42 |
+
:return: an (N, D) Tensor of positional embeddings.
|
| 43 |
+
"""
|
| 44 |
+
t = time_factor * t
|
| 45 |
+
half = dim // 2
|
| 46 |
+
freqs = torch.exp(-math.log(max_period) * torch.arange(start=0, end=half, dtype=torch.float32) / half).to(
|
| 47 |
+
t.device
|
| 48 |
+
)
|
| 49 |
+
|
| 50 |
+
args = t[:, None].float() * freqs[None]
|
| 51 |
+
embedding = torch.cat([torch.cos(args), torch.sin(args)], dim=-1)
|
| 52 |
+
if dim % 2:
|
| 53 |
+
embedding = torch.cat([embedding, torch.zeros_like(embedding[:, :1])], dim=-1)
|
| 54 |
+
if torch.is_floating_point(t):
|
| 55 |
+
embedding = embedding.to(t)
|
| 56 |
+
return embedding
|
| 57 |
+
|
| 58 |
+
|
| 59 |
+
class MLPEmbedder(nn.Module):
|
| 60 |
+
def __init__(self, in_dim: int, hidden_dim: int):
|
| 61 |
+
super().__init__()
|
| 62 |
+
self.in_layer = nn.Linear(in_dim, hidden_dim, bias=True)
|
| 63 |
+
self.silu = nn.SiLU()
|
| 64 |
+
self.out_layer = nn.Linear(hidden_dim, hidden_dim, bias=True)
|
| 65 |
+
|
| 66 |
+
def forward(self, x: Tensor) -> Tensor:
|
| 67 |
+
return self.out_layer(self.silu(self.in_layer(x)))
|
| 68 |
+
|
| 69 |
+
def reshape_tensor(x, heads):
|
| 70 |
+
# print("x in reshape_tensor", x.shape)
|
| 71 |
+
bs, length, width = x.shape
|
| 72 |
+
# (bs, length, width) --> (bs, length, n_heads, dim_per_head)
|
| 73 |
+
x = x.view(bs, length, heads, -1)
|
| 74 |
+
# (bs, length, n_heads, dim_per_head) --> (bs, n_heads, length, dim_per_head)
|
| 75 |
+
x = x.transpose(1, 2)
|
| 76 |
+
# (bs, n_heads, length, dim_per_head) --> (bs*n_heads, length, dim_per_head)
|
| 77 |
+
x = x.reshape(bs, heads, length, -1)
|
| 78 |
+
return x
|
| 79 |
+
class PerceiverAttentionCA(nn.Module):
|
| 80 |
+
def __init__(self, *, dim=3072, dim_head=64, heads=16, kv_dim=2048):
|
| 81 |
+
super().__init__()
|
| 82 |
+
self.scale = dim_head ** -0.5
|
| 83 |
+
self.dim_head = dim_head
|
| 84 |
+
self.heads = heads
|
| 85 |
+
inner_dim = dim_head * heads
|
| 86 |
+
|
| 87 |
+
self.norm1 = nn.LayerNorm(dim if kv_dim is None else kv_dim)
|
| 88 |
+
self.norm2 = nn.LayerNorm(dim)
|
| 89 |
+
|
| 90 |
+
self.to_q = nn.Linear(dim, inner_dim, bias=False)
|
| 91 |
+
self.to_kv = nn.Linear(dim if kv_dim is None else kv_dim, inner_dim * 2, bias=False)
|
| 92 |
+
self.to_out = nn.Linear(inner_dim, dim, bias=False)
|
| 93 |
+
|
| 94 |
+
def forward(self, x, latents, mask=None):
|
| 95 |
+
"""
|
| 96 |
+
Args:
|
| 97 |
+
x (torch.Tensor): reference (ID) features, used as keys/values
|
| 98 |
+
shape (b, n1, D)
|
| 99 |
+
latents (torch.Tensor): image tokens, used as queries
|
| 100 |
+
shape (b, n2, D)
|
| 101 |
+
"""
|
| 102 |
+
x = self.norm1(x)
|
| 103 |
+
latents = self.norm2(latents)
|
| 104 |
+
|
| 105 |
+
# print("x, latents in PerceiverAttentionCA", x.shape, latents.shape)
|
| 106 |
+
|
| 107 |
+
b, seq_len, _ = latents.shape
|
| 108 |
+
|
| 109 |
+
q = self.to_q(latents)
|
| 110 |
+
k, v = self.to_kv(x).chunk(2, dim=-1)
|
| 111 |
+
|
| 112 |
+
# print("q, k, v in PerceiverAttentionCA", q.shape, k.shape, v.shape)
|
| 113 |
+
|
| 114 |
+
q = reshape_tensor(q, self.heads)
|
| 115 |
+
k = reshape_tensor(k, self.heads)
|
| 116 |
+
v = reshape_tensor(v, self.heads)
|
| 117 |
+
|
| 118 |
+
# # attention
|
| 119 |
+
# scale = 1 / math.sqrt(math.sqrt(self.dim_head))
|
| 120 |
+
# weight = (q * scale) @ (k * scale).transpose(-2, -1) # More stable with f16 than dividing afterwards
|
| 121 |
+
# print("is there any nan in weight:", torch.isnan(weight).any())
|
| 122 |
+
# if mask is not None:
|
| 123 |
+
# # Mask shape should be [batch_size, num_heads, q_len, kv_len]
|
| 124 |
+
# # weight = weight.masked_fill(mask == 0, float("-inf"))
|
| 125 |
+
# if mask.dtype == torch.bool:
|
| 126 |
+
# # Boolean mask: False values are masked out
|
| 127 |
+
# # print("Got boolean mask")
|
| 128 |
+
# weight = weight.masked_fill(~mask, -float('inf'))
|
| 129 |
+
# else:
|
| 130 |
+
# # Float mask: values are added directly to scores
|
| 131 |
+
# weight = weight + mask
|
| 132 |
+
# print("is there any nan in weight after mask:", torch.isnan(weight).any())
|
| 133 |
+
# weight = torch.softmax(weight, dim=-1)
|
| 134 |
+
# print("is there any nan in weight after softmax:", torch.isnan(weight).any())
|
| 135 |
+
# out = weight @ v
|
| 136 |
+
|
| 137 |
+
# use sdpa
|
| 138 |
+
# if mask is not None:
|
| 139 |
+
# print("mask shape in PerceiverAttentionCA", mask.shape)
|
| 140 |
+
out = F.scaled_dot_product_attention(q, k, v, attn_mask=mask)
|
| 141 |
+
|
| 142 |
+
out = out.permute(0, 2, 1, 3).reshape(b, seq_len, -1)
|
| 143 |
+
|
| 144 |
+
return self.to_out(out)
|
| 145 |
+
|
| 146 |
+
|
| 147 |
+
|
| 148 |
+
|
| 149 |
+
class RMSNorm(torch.nn.Module):
|
| 150 |
+
def __init__(self, dim: int):
|
| 151 |
+
super().__init__()
|
| 152 |
+
self.scale = nn.Parameter(torch.ones(dim))
|
| 153 |
+
|
| 154 |
+
def forward(self, x: Tensor):
|
| 155 |
+
x_dtype = x.dtype
|
| 156 |
+
x = x.float()
|
| 157 |
+
rrms = torch.rsqrt(torch.mean(x**2, dim=-1, keepdim=True) + 1e-6)
|
| 158 |
+
return ((x * rrms) * self.scale.float()).to(dtype=x_dtype)
|
| 159 |
+
|
| 160 |
+
|
| 161 |
+
class QKNorm(torch.nn.Module):
|
| 162 |
+
def __init__(self, dim: int):
|
| 163 |
+
super().__init__()
|
| 164 |
+
self.query_norm = RMSNorm(dim)
|
| 165 |
+
self.key_norm = RMSNorm(dim)
|
| 166 |
+
|
| 167 |
+
def forward(self, q: Tensor, k: Tensor, v: Tensor) -> tuple[Tensor, Tensor]:
|
| 168 |
+
q = self.query_norm(q)
|
| 169 |
+
k = self.key_norm(k)
|
| 170 |
+
return q.to(v), k.to(v)
|
| 171 |
+
|
| 172 |
+
class LoRALinearLayer(nn.Module):
|
| 173 |
+
def __init__(self, in_features, out_features, rank=4, network_alpha=None, device=None, dtype=None):
|
| 174 |
+
super().__init__()
|
| 175 |
+
|
| 176 |
+
self.down = nn.Linear(in_features, rank, bias=False, device=device, dtype=dtype)
|
| 177 |
+
self.up = nn.Linear(rank, out_features, bias=False, device=device, dtype=dtype)
|
| 178 |
+
# This value has the same meaning as the `--network_alpha` option in the kohya-ss trainer script.
|
| 179 |
+
# See https://github.com/darkstorm2150/sd-scripts/blob/main/docs/train_network_README-en.md#execute-learning
|
| 180 |
+
self.network_alpha = network_alpha
|
| 181 |
+
self.rank = rank
|
| 182 |
+
|
| 183 |
+
nn.init.normal_(self.down.weight, std=1 / rank)
|
| 184 |
+
nn.init.zeros_(self.up.weight)
|
| 185 |
+
|
| 186 |
+
def forward(self, hidden_states):
|
| 187 |
+
orig_dtype = hidden_states.dtype
|
| 188 |
+
dtype = self.down.weight.dtype
|
| 189 |
+
|
| 190 |
+
down_hidden_states = self.down(hidden_states.to(dtype))
|
| 191 |
+
up_hidden_states = self.up(down_hidden_states)
|
| 192 |
+
|
| 193 |
+
if self.network_alpha is not None:
|
| 194 |
+
up_hidden_states *= self.network_alpha / self.rank
|
| 195 |
+
|
| 196 |
+
return up_hidden_states.to(orig_dtype)
|
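LoRALinearLayer is a standard zero-initialized low-rank adapter: up starts at zero, so a freshly constructed layer is a no-op and only contributes after training. A minimal check (all sizes are illustrative):

```python
import torch
from withanyone.flux.modules.layers import LoRALinearLayer

lora = LoRALinearLayer(in_features=64, out_features=64, rank=4, network_alpha=4)
x = torch.randn(2, 10, 64)
print(lora(x).abs().max().item())  # 0.0 at initialization, because up.weight is zero-initialized
```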
| 197 |
+
|
| 198 |
+
class FLuxSelfAttnProcessor:
|
| 199 |
+
def __call__(self, attn, x, pe, **attention_kwargs):
|
| 200 |
+
qkv = attn.qkv(x)
|
| 201 |
+
q, k, v = rearrange(qkv, "B L (K H D) -> K B H L D", K=3, H=attn.num_heads)
|
| 202 |
+
q, k = attn.norm(q, k, v)
|
| 203 |
+
x = attention(q, k, v, pe=pe)
|
| 204 |
+
x = attn.proj(x)
|
| 205 |
+
return x
|
| 206 |
+
|
| 207 |
+
class LoraFluxAttnProcessor(nn.Module):
|
| 208 |
+
|
| 209 |
+
def __init__(self, dim: int, rank=4, network_alpha=None, lora_weight=1):
|
| 210 |
+
super().__init__()
|
| 211 |
+
self.qkv_lora = LoRALinearLayer(dim, dim * 3, rank, network_alpha)
|
| 212 |
+
self.proj_lora = LoRALinearLayer(dim, dim, rank, network_alpha)
|
| 213 |
+
self.lora_weight = lora_weight
|
| 214 |
+
|
| 215 |
+
|
| 216 |
+
def __call__(self, attn, x, pe, **attention_kwargs):
|
| 217 |
+
qkv = attn.qkv(x) + self.qkv_lora(x) * self.lora_weight
|
| 218 |
+
q, k, v = rearrange(qkv, "B L (K H D) -> K B H L D", K=3, H=attn.num_heads)
|
| 219 |
+
q, k = attn.norm(q, k, v)
|
| 220 |
+
x = attention(q, k, v, pe=pe)
|
| 221 |
+
x = attn.proj(x) + self.proj_lora(x) * self.lora_weight
|
| 222 |
+
return x
|
| 223 |
+
|
| 224 |
+
|
| 225 |
+
class SelfAttention(nn.Module):
|
| 226 |
+
def __init__(self, dim: int, num_heads: int = 8, qkv_bias: bool = False):
|
| 227 |
+
super().__init__()
|
| 228 |
+
self.num_heads = num_heads
|
| 229 |
+
head_dim = dim // num_heads
|
| 230 |
+
|
| 231 |
+
self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
|
| 232 |
+
self.norm = QKNorm(head_dim)
|
| 233 |
+
self.proj = nn.Linear(dim, dim)
|
| 234 |
+
def forward(self):  # attention is executed through the block processors; this stub is never called
|
| 235 |
+
pass
|
| 236 |
+
|
| 237 |
+
@dataclass
|
| 238 |
+
class ModulationOut:
|
| 239 |
+
shift: Tensor
|
| 240 |
+
scale: Tensor
|
| 241 |
+
gate: Tensor
|
| 242 |
+
|
| 243 |
+
|
| 244 |
+
class Modulation(nn.Module):
|
| 245 |
+
def __init__(self, dim: int, double: bool):
|
| 246 |
+
super().__init__()
|
| 247 |
+
self.is_double = double
|
| 248 |
+
self.multiplier = 6 if double else 3
|
| 249 |
+
self.lin = nn.Linear(dim, self.multiplier * dim, bias=True)
|
| 250 |
+
|
| 251 |
+
def forward(self, vec: Tensor) -> tuple[ModulationOut, ModulationOut | None]:
|
| 252 |
+
out = self.lin(nn.functional.silu(vec))[:, None, :].chunk(self.multiplier, dim=-1)
|
| 253 |
+
|
| 254 |
+
return (
|
| 255 |
+
ModulationOut(*out[:3]),
|
| 256 |
+
ModulationOut(*out[3:]) if self.is_double else None,
|
| 257 |
+
)
|
| 258 |
+
|
| 259 |
+
class DoubleStreamBlockLoraProcessor(nn.Module):
|
| 260 |
+
def __init__(self, dim: int, rank=4, network_alpha=None, lora_weight=1):
|
| 261 |
+
super().__init__()
|
| 262 |
+
self.qkv_lora1 = LoRALinearLayer(dim, dim * 3, rank, network_alpha)
|
| 263 |
+
self.proj_lora1 = LoRALinearLayer(dim, dim, rank, network_alpha)
|
| 264 |
+
self.qkv_lora2 = LoRALinearLayer(dim, dim * 3, rank, network_alpha)
|
| 265 |
+
self.proj_lora2 = LoRALinearLayer(dim, dim, rank, network_alpha)
|
| 266 |
+
self.lora_weight = lora_weight
|
| 267 |
+
|
| 268 |
+
def forward(self, attn, img, txt, vec, pe, mask, text_length, image_length, **attention_kwargs):
|
| 269 |
+
img_mod1, img_mod2 = attn.img_mod(vec)
|
| 270 |
+
txt_mod1, txt_mod2 = attn.txt_mod(vec)
|
| 271 |
+
|
| 272 |
+
# prepare image for attention
|
| 273 |
+
img_modulated = attn.img_norm1(img)
|
| 274 |
+
img_modulated = (1 + img_mod1.scale) * img_modulated + img_mod1.shift
|
| 275 |
+
img_qkv = attn.img_attn.qkv(img_modulated) + self.qkv_lora1(img_modulated) * self.lora_weight
|
| 276 |
+
img_q, img_k, img_v = rearrange(img_qkv, "B L (K H D) -> K B H L D", K=3, H=attn.num_heads)
|
| 277 |
+
img_q, img_k = attn.img_attn.norm(img_q, img_k, img_v)
|
| 278 |
+
|
| 279 |
+
# prepare txt for attention
|
| 280 |
+
txt_modulated = attn.txt_norm1(txt)
|
| 281 |
+
txt_modulated = (1 + txt_mod1.scale) * txt_modulated + txt_mod1.shift
|
| 282 |
+
txt_qkv = attn.txt_attn.qkv(txt_modulated) + self.qkv_lora2(txt_modulated) * self.lora_weight
|
| 283 |
+
txt_q, txt_k, txt_v = rearrange(txt_qkv, "B L (K H D) -> K B H L D", K=3, H=attn.num_heads)
|
| 284 |
+
txt_q, txt_k = attn.txt_attn.norm(txt_q, txt_k, txt_v)
|
| 285 |
+
|
| 286 |
+
# run actual attention
|
| 287 |
+
q = torch.cat((txt_q, img_q), dim=2)
|
| 288 |
+
k = torch.cat((txt_k, img_k), dim=2)
|
| 289 |
+
v = torch.cat((txt_v, img_v), dim=2)
|
| 290 |
+
|
| 291 |
+
attn1 = attention(q, k, v, pe=pe, mask=mask, token_aug_idx=TOKEN_AUG_IDX, text_length=text_length, image_length=image_length)
|
| 292 |
+
txt_attn, img_attn = attn1[:, : txt.shape[1]], attn1[:, txt.shape[1] :]
|
| 293 |
+
|
| 294 |
+
# calculate the img blocks
|
| 295 |
+
img = img + img_mod1.gate * (attn.img_attn.proj(img_attn) + self.proj_lora1(img_attn) * self.lora_weight)
|
| 296 |
+
img = img + img_mod2.gate * attn.img_mlp((1 + img_mod2.scale) * attn.img_norm2(img) + img_mod2.shift)
|
| 297 |
+
|
| 298 |
+
# calculate the txt blocks
|
| 299 |
+
txt = txt + txt_mod1.gate * (attn.txt_attn.proj(txt_attn) + self.proj_lora2(txt_attn) * self.lora_weight)
|
| 300 |
+
txt = txt + txt_mod2.gate * attn.txt_mlp((1 + txt_mod2.scale) * attn.txt_norm2(txt) + txt_mod2.shift)
|
| 301 |
+
|
| 302 |
+
|
| 303 |
+
return img, txt
|
| 304 |
+
|
| 305 |
+
class DoubleStreamBlockProcessor:
|
| 306 |
+
def __call__(self, attn, img, txt, vec, pe, mask, text_length, image_length, **attention_kwargs):
|
| 307 |
+
img_mod1, img_mod2 = attn.img_mod(vec)
|
| 308 |
+
txt_mod1, txt_mod2 = attn.txt_mod(vec)
|
| 309 |
+
|
| 310 |
+
# prepare image for attention
|
| 311 |
+
img_modulated = attn.img_norm1(img)
|
| 312 |
+
img_modulated = (1 + img_mod1.scale) * img_modulated + img_mod1.shift
|
| 313 |
+
img_qkv = attn.img_attn.qkv(img_modulated)
|
| 314 |
+
img_q, img_k, img_v = rearrange(img_qkv, "B L (K H D) -> K B H L D", K=3, H=attn.num_heads, D=attn.head_dim)
|
| 315 |
+
img_q, img_k = attn.img_attn.norm(img_q, img_k, img_v)
|
| 316 |
+
|
| 317 |
+
# prepare txt for attention
|
| 318 |
+
txt_modulated = attn.txt_norm1(txt)
|
| 319 |
+
txt_modulated = (1 + txt_mod1.scale) * txt_modulated + txt_mod1.shift
|
| 320 |
+
txt_qkv = attn.txt_attn.qkv(txt_modulated)
|
| 321 |
+
txt_q, txt_k, txt_v = rearrange(txt_qkv, "B L (K H D) -> K B H L D", K=3, H=attn.num_heads, D=attn.head_dim)
|
| 322 |
+
txt_q, txt_k = attn.txt_attn.norm(txt_q, txt_k, txt_v)
|
| 323 |
+
|
| 324 |
+
# run actual attention
|
| 325 |
+
q = torch.cat((txt_q, img_q), dim=2)
|
| 326 |
+
k = torch.cat((txt_k, img_k), dim=2)
|
| 327 |
+
v = torch.cat((txt_v, img_v), dim=2)
|
| 328 |
+
|
| 329 |
+
|
| 330 |
+
attn1 = attention(q, k, v, pe=pe, mask=mask, token_aug_idx=TOKEN_AUG_IDX, text_length=text_length, image_length=image_length)
|
| 331 |
+
|
| 332 |
+
|
| 333 |
+
txt_attn, img_attn = attn1[:, : txt.shape[1]], attn1[:, txt.shape[1] :]
|
| 334 |
+
|
| 335 |
+
# calculate the img blocks
|
| 336 |
+
img = img + img_mod1.gate * attn.img_attn.proj(img_attn)
|
| 337 |
+
img = img + img_mod2.gate * attn.img_mlp((1 + img_mod2.scale) * attn.img_norm2(img) + img_mod2.shift)
|
| 338 |
+
|
| 339 |
+
# calculate the txt blocks
|
| 340 |
+
txt = txt + txt_mod1.gate * attn.txt_attn.proj(txt_attn)
|
| 341 |
+
txt = txt + txt_mod2.gate * attn.txt_mlp((1 + txt_mod2.scale) * attn.txt_norm2(txt) + txt_mod2.shift)
|
| 342 |
+
|
| 343 |
+
|
| 344 |
+
return img, txt
|
| 345 |
+
|
| 346 |
+
class DoubleStreamBlock(nn.Module):
|
| 347 |
+
def __init__(self, hidden_size: int, num_heads: int, mlp_ratio: float, qkv_bias: bool = False):
|
| 348 |
+
super().__init__()
|
| 349 |
+
mlp_hidden_dim = int(hidden_size * mlp_ratio)
|
| 350 |
+
self.num_heads = num_heads
|
| 351 |
+
self.hidden_size = hidden_size
|
| 352 |
+
self.head_dim = hidden_size // num_heads
|
| 353 |
+
|
| 354 |
+
self.img_mod = Modulation(hidden_size, double=True)
|
| 355 |
+
self.img_norm1 = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
|
| 356 |
+
self.img_attn = SelfAttention(dim=hidden_size, num_heads=num_heads, qkv_bias=qkv_bias)
|
| 357 |
+
|
| 358 |
+
self.img_norm2 = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
|
| 359 |
+
self.img_mlp = nn.Sequential(
|
| 360 |
+
nn.Linear(hidden_size, mlp_hidden_dim, bias=True),
|
| 361 |
+
nn.GELU(approximate="tanh"),
|
| 362 |
+
nn.Linear(mlp_hidden_dim, hidden_size, bias=True),
|
| 363 |
+
)
|
| 364 |
+
|
| 365 |
+
self.txt_mod = Modulation(hidden_size, double=True)
|
| 366 |
+
self.txt_norm1 = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
|
| 367 |
+
self.txt_attn = SelfAttention(dim=hidden_size, num_heads=num_heads, qkv_bias=qkv_bias)
|
| 368 |
+
|
| 369 |
+
self.txt_norm2 = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
|
| 370 |
+
self.txt_mlp = nn.Sequential(
|
| 371 |
+
nn.Linear(hidden_size, mlp_hidden_dim, bias=True),
|
| 372 |
+
nn.GELU(approximate="tanh"),
|
| 373 |
+
nn.Linear(mlp_hidden_dim, hidden_size, bias=True),
|
| 374 |
+
)
|
| 375 |
+
processor = DoubleStreamBlockProcessor()
|
| 376 |
+
self.set_processor(processor)
|
| 377 |
+
|
| 378 |
+
def set_processor(self, processor) -> None:
|
| 379 |
+
self.processor = processor
|
| 380 |
+
|
| 381 |
+
def get_processor(self):
|
| 382 |
+
return self.processor
|
| 383 |
+
|
| 384 |
+
def forward(
|
| 385 |
+
self,
|
| 386 |
+
img: Tensor,
|
| 387 |
+
txt: Tensor,
|
| 388 |
+
vec: Tensor,
|
| 389 |
+
pe: Tensor,
|
| 390 |
+
image_proj: Tensor | None = None,
|
| 391 |
+
ip_scale: float = 1.0,
|
| 392 |
+
mask: Tensor | None = None,
|
| 393 |
+
text_length: int = None,
|
| 394 |
+
image_length: int = None,
|
| 395 |
+
return_map: bool = False,
|
| 396 |
+
**attention_kwargs
|
| 397 |
+
) -> tuple[Tensor, Tensor]:
|
| 398 |
+
if image_proj is None:
|
| 399 |
+
|
| 400 |
+
return self.processor(self, img, txt, vec, pe, mask, text_length, image_length)
|
| 401 |
+
else:
|
| 402 |
+
|
| 403 |
+
return self.processor(self, img, txt, vec, pe, mask, text_length, image_length, image_proj, ip_scale)
|
| 404 |
+
|
| 405 |
+
|
| 406 |
+
class SingleStreamBlockLoraProcessor(nn.Module):
|
| 407 |
+
def __init__(self, dim: int, rank: int = 4, network_alpha = None, lora_weight: float = 1):
|
| 408 |
+
super().__init__()
|
| 409 |
+
self.qkv_lora = LoRALinearLayer(dim, dim * 3, rank, network_alpha)
|
| 410 |
+
self.proj_lora = LoRALinearLayer(15360, dim, rank, network_alpha)  # 15360 = hidden_size + mlp_hidden_dim for the default 3072-wide model
|
| 411 |
+
self.lora_weight = lora_weight
|
| 412 |
+
|
| 413 |
+
def forward(self, attn: nn.Module, x: Tensor, vec: Tensor, pe: Tensor, mask = None, text_length = None, image_length = None, return_map=False) -> Tensor:
|
| 414 |
+
|
| 415 |
+
mod, _ = attn.modulation(vec)
|
| 416 |
+
x_mod = (1 + mod.scale) * attn.pre_norm(x) + mod.shift
|
| 417 |
+
qkv, mlp = torch.split(attn.linear1(x_mod), [3 * attn.hidden_size, attn.mlp_hidden_dim], dim=-1)
|
| 418 |
+
qkv = qkv + self.qkv_lora(x_mod) * self.lora_weight
|
| 419 |
+
|
| 420 |
+
q, k, v = rearrange(qkv, "B L (K H D) -> K B H L D", K=3, H=attn.num_heads)
|
| 421 |
+
q, k = attn.norm(q, k, v)
|
| 422 |
+
|
| 423 |
+
# compute attention
|
| 424 |
+
|
| 425 |
+
attn_1 = attention(q, k, v, pe=pe, mask=mask, token_aug_idx=TOKEN_AUG_IDX, text_length=text_length, image_length=image_length)
|
| 426 |
+
|
| 427 |
+
# compute activation in mlp stream, cat again and run second linear layer
|
| 428 |
+
output = attn.linear2(torch.cat((attn_1, attn.mlp_act(mlp)), 2))
|
| 429 |
+
output = output + self.proj_lora(torch.cat((attn_1, attn.mlp_act(mlp)), 2)) * self.lora_weight
|
| 430 |
+
output = x + mod.gate * output
|
| 431 |
+
|
| 432 |
+
|
| 433 |
+
return output
|
| 434 |
+
|
| 435 |
+
|
| 436 |
+
class SingleStreamBlockProcessor:
|
| 437 |
+
def __call__(self, attn: nn.Module, x: Tensor, vec: Tensor, pe: Tensor, mask: Tensor, text_length, image_length, return_map=False, **attention_kwargs) -> Tensor:
|
| 438 |
+
|
| 439 |
+
mod, _ = attn.modulation(vec)
|
| 440 |
+
x_mod = (1 + mod.scale) * attn.pre_norm(x) + mod.shift
|
| 441 |
+
qkv, mlp = torch.split(attn.linear1(x_mod), [3 * attn.hidden_size, attn.mlp_hidden_dim], dim=-1)
|
| 442 |
+
|
| 443 |
+
q, k, v = rearrange(qkv, "B L (K H D) -> K B H L D", K=3, H=attn.num_heads)
|
| 444 |
+
q, k = attn.norm(q, k, v)
|
| 445 |
+
|
| 446 |
+
# compute attention
|
| 447 |
+
|
| 448 |
+
|
| 449 |
+
attn_1 = attention(q, k, v, pe=pe, mask=mask, token_aug_idx=TOKEN_AUG_IDX, text_length=text_length, image_length=image_length)
|
| 450 |
+
|
| 451 |
+
# compute activation in mlp stream, cat again and run second linear layer
|
| 452 |
+
output = attn.linear2(torch.cat((attn_1, attn.mlp_act(mlp)), 2))
|
| 453 |
+
output = x + mod.gate * output
|
| 454 |
+
|
| 455 |
+
return output
|
| 456 |
+
|
| 457 |
+
class SingleStreamBlock(nn.Module):
|
| 458 |
+
"""
|
| 459 |
+
A DiT block with parallel linear layers as described in
|
| 460 |
+
https://arxiv.org/abs/2302.05442 and adapted modulation interface.
|
| 461 |
+
"""
|
| 462 |
+
|
| 463 |
+
def __init__(
|
| 464 |
+
self,
|
| 465 |
+
hidden_size: int,
|
| 466 |
+
num_heads: int,
|
| 467 |
+
mlp_ratio: float = 4.0,
|
| 468 |
+
qk_scale: float | None = None,
|
| 469 |
+
):
|
| 470 |
+
super().__init__()
|
| 471 |
+
self.hidden_dim = hidden_size
|
| 472 |
+
self.num_heads = num_heads
|
| 473 |
+
self.head_dim = hidden_size // num_heads
|
| 474 |
+
self.scale = qk_scale or self.head_dim**-0.5
|
| 475 |
+
|
| 476 |
+
self.mlp_hidden_dim = int(hidden_size * mlp_ratio)
|
| 477 |
+
# qkv and mlp_in
|
| 478 |
+
self.linear1 = nn.Linear(hidden_size, hidden_size * 3 + self.mlp_hidden_dim)
|
| 479 |
+
# proj and mlp_out
|
| 480 |
+
self.linear2 = nn.Linear(hidden_size + self.mlp_hidden_dim, hidden_size)
|
| 481 |
+
|
| 482 |
+
self.norm = QKNorm(self.head_dim)
|
| 483 |
+
|
| 484 |
+
self.hidden_size = hidden_size
|
| 485 |
+
self.pre_norm = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
|
| 486 |
+
|
| 487 |
+
self.mlp_act = nn.GELU(approximate="tanh")
|
| 488 |
+
self.modulation = Modulation(hidden_size, double=False)
|
| 489 |
+
|
| 490 |
+
processor = SingleStreamBlockProcessor()
|
| 491 |
+
self.set_processor(processor)
|
| 492 |
+
|
| 493 |
+
|
| 494 |
+
def set_processor(self, processor) -> None:
|
| 495 |
+
self.processor = processor
|
| 496 |
+
|
| 497 |
+
def get_processor(self):
|
| 498 |
+
return self.processor
|
| 499 |
+
|
| 500 |
+
def forward(
|
| 501 |
+
self,
|
| 502 |
+
x: Tensor,
|
| 503 |
+
vec: Tensor,
|
| 504 |
+
pe: Tensor,
|
| 505 |
+
image_proj: Tensor | None = None,
|
| 506 |
+
ip_scale: float = 1.0,
|
| 507 |
+
mask: Tensor | None = None,
|
| 508 |
+
text_length: int | None = None,
|
| 509 |
+
image_length: int | None = None,
|
| 510 |
+
return_map: bool = False,
|
| 511 |
+
) -> Tensor:
|
| 512 |
+
if image_proj is None:
|
| 513 |
+
return self.processor(self, x, vec, pe, mask, text_length=text_length, image_length=image_length)
|
| 514 |
+
else:
|
| 515 |
+
return self.processor(self, x, vec, pe, mask, image_proj, ip_scale, text_length=text_length, image_length=image_length)
|
| 516 |
+
|
| 517 |
+
|
| 518 |
+
|
| 519 |
+
class LastLayer(nn.Module):
|
| 520 |
+
def __init__(self, hidden_size: int, patch_size: int, out_channels: int):
|
| 521 |
+
super().__init__()
|
| 522 |
+
self.norm_final = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
|
| 523 |
+
self.linear = nn.Linear(hidden_size, patch_size * patch_size * out_channels, bias=True)
|
| 524 |
+
self.adaLN_modulation = nn.Sequential(nn.SiLU(), nn.Linear(hidden_size, 2 * hidden_size, bias=True))
|
| 525 |
+
|
| 526 |
+
def forward(self, x: Tensor, vec: Tensor) -> Tensor:
|
| 527 |
+
shift, scale = self.adaLN_modulation(vec).chunk(2, dim=1)
|
| 528 |
+
x = (1 + scale[:, None, :]) * self.norm_final(x) + shift[:, None, :]
|
| 529 |
+
x = self.linear(x)
|
| 530 |
+
return x
|
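The conditioning path shared by both block types is timestep_embedding -> MLPEmbedder -> Modulation, which produces the shift / scale / gate triples used to modulate each stream. A small sketch with toy dimensions:

```python
import torch
from withanyone.flux.modules.layers import MLPEmbedder, Modulation, timestep_embedding

hidden = 64                             # toy hidden size
t = torch.rand(2)                       # one timestep in [0, 1] per batch element
vec = MLPEmbedder(in_dim=256, hidden_dim=hidden)(timestep_embedding(t, 256))
mod = Modulation(hidden, double=True)
mod1, mod2 = mod(vec)                   # two ModulationOut triples for a double block
print(vec.shape, mod1.shift.shape)      # torch.Size([2, 64]) torch.Size([2, 1, 64])
```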
withanyone/flux/pipeline.py
ADDED
|
@@ -0,0 +1,406 @@
|
# Copyright (c) 2025 Bytedance Ltd. and/or its affiliates. All rights reserved.
# Copyright (c) 2024 Black Forest Labs and The XLabs-AI Team. All rights reserved.

# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at

#     http://www.apache.org/licenses/LICENSE-2.0

# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import os
from typing import Literal

import torch
from einops import rearrange
from PIL import ExifTags, Image
import torchvision.transforms.functional as TVF


from withanyone.flux.modules.layers import (
    DoubleStreamBlockLoraProcessor,
    DoubleStreamBlockProcessor,
    SingleStreamBlockLoraProcessor,
    SingleStreamBlockProcessor,
)
from withanyone.flux.sampling import denoise, get_noise, get_schedule, prepare, unpack
from withanyone.flux.util import (
    load_ae,
    load_clip,
    load_flow_model_no_lora,
    load_flow_model_diffusers,
    load_t5,
)

from withanyone.flux.model import SiglipEmbedding, create_person_cross_attention_mask_varlen


def preprocess_ref(raw_image: Image.Image, long_size: int = 512):
    # Resize so the longer side equals long_size, then center-crop to multiples of 16.
    image_w, image_h = raw_image.size

    if image_w >= image_h:
        new_w = long_size
        new_h = int((long_size / image_w) * image_h)
    else:
        new_h = long_size
        new_w = int((long_size / image_h) * image_w)

    raw_image = raw_image.resize((new_w, new_h), resample=Image.LANCZOS)
    target_w = new_w // 16 * 16
    target_h = new_h // 16 * 16

    left = (new_w - target_w) // 2
    top = (new_h - target_h) // 2
    right = left + target_w
    bottom = top + target_h

    raw_image = raw_image.crop((left, top, right, bottom))

    raw_image = raw_image.convert("RGB")
    return raw_image


from io import BytesIO
import insightface
import numpy as np


class FaceExtractor:
    def __init__(self, model_path="./"):
        self.model = insightface.app.FaceAnalysis(name="antelopev2", root=model_path, providers=['CUDAExecutionProvider'])
        self.model.prepare(ctx_id=0, det_thresh=0.45)

    def extract_moref(self, img, bboxes, face_size_restriction=1):
        """
        Extract faces from an image based on bounding boxes.
        Makes each face square and resizes it to 512x512.

        Args:
            img: PIL Image or image data
            bboxes: list of [x1, y1, x2, y2] bounding boxes
            face_size_restriction: minimum face width/height in pixels

        Returns:
            List of PIL Images, each 512x512, containing the extracted faces
        """
        # Ensure img is a PIL Image
        try:
            if not isinstance(img, Image.Image) and not isinstance(img, torch.Tensor):
                img = Image.open(BytesIO(img))

            new_bboxes = bboxes
            # If any face is smaller than the size restriction, ignore this image.
            for bbox in new_bboxes:
                x1, y1, x2, y2 = bbox
                if x2 - x1 < face_size_restriction or y2 - y1 < face_size_restriction:
                    return []

            faces = []
            for bbox in new_bboxes:
                # Convert coordinates to integers
                x1, y1, x2, y2 = map(int, bbox)

                # Calculate width and height
                width = x2 - x1
                height = y2 - y1

                # Make the bounding box square by expanding the shorter dimension
                if width > height:
                    # Height is shorter, expand it
                    diff = width - height
                    y1 -= diff // 2
                    y2 += diff - (diff // 2)  # Handle odd differences
                elif height > width:
                    # Width is shorter, expand it
                    diff = height - width
                    x1 -= diff // 2
                    x2 += diff - (diff // 2)  # Handle odd differences

                # Ensure coordinates are within image boundaries
                img_width, img_height = img.size
                x1 = max(0, x1)
                y1 = max(0, y1)
                x2 = min(img_width, x2)
                y2 = min(img_height, y2)

                # Extract face region
                face_region = img.crop((x1, y1, x2, y2))

                # Resize to 512x512
                face_region = face_region.resize((512, 512), Image.LANCZOS)

                faces.append(face_region)
            return faces
        except Exception as e:
            print(f"Error processing image: {e}")
            return []

    def __call__(self, img):
        # Normalize the input to both a PIL image and a numpy array.
        if isinstance(img, torch.Tensor):
            img_np = img.cpu().numpy()
            img_pil = Image.fromarray(img_np)
        elif isinstance(img, Image.Image):
            img_pil = img
            img_np = np.array(img)
        elif isinstance(img, np.ndarray):
            img_np = img
            img_pil = Image.fromarray(img)
        else:
            raise ValueError("Unsupported image format. Please provide a PIL Image or numpy array.")
        # Detect faces in the image and use the first one.
        faces = self.model.get(img_np)
        if len(faces) > 0:
            bboxes = []
            face = faces[0]
            bbox = face.bbox.astype(int)
            bboxes.append(bbox)
            return self.extract_moref(img_pil, bboxes)[0]
        else:
            print("Warning: No faces detected in the image.")
            return img_pil


class WithAnyonePipeline:
    def __init__(
        self,
        model_type: str,
        ipa_path: str,
        device: torch.device,
        offload: bool = False,
        only_lora: bool = False,
        no_lora: bool = False,
        lora_rank: int = 16,
        face_extractor=None,
        additional_lora_ckpt: str = None,
        lora_weight: float = 1.0,
        clip_path: str = "openai/clip-vit-large-patch14",
        t5_path: str = "xlabs-ai/xflux_text_encoders",
        flux_path: str = "black-forest-labs/FLUX.1-dev",
        siglip_path: str = "google/siglip-base-patch16-256-i18n",
    ):
        self.device = device
        self.offload = offload
        self.model_type = model_type

        self.clip = load_clip(clip_path, self.device)
        self.t5 = load_t5(t5_path, self.device, max_length=512)
        self.ae = load_ae(flux_path, model_type, device="cpu" if offload else self.device)
        self.use_fp8 = "fp8" in model_type

        if additional_lora_ckpt is not None:
            self.model = load_flow_model_diffusers(
                model_type,
                flux_path,
                ipa_path,
                device="cpu" if offload else self.device,
                lora_rank=lora_rank,
                use_fp8=self.use_fp8,
                additional_lora_ckpt=additional_lora_ckpt,
                lora_weight=lora_weight,
            ).to("cuda" if torch.cuda.is_available() else "cpu")
        else:
            self.model = load_flow_model_no_lora(
                model_type,
                flux_path,
                ipa_path,
                device="cpu" if offload else self.device,
                use_fp8=self.use_fp8,
            )

        if face_extractor is not None:
            self.face_extractor = face_extractor
        else:
            self.face_extractor = FaceExtractor()

        self.siglip = SiglipEmbedding(siglip_path=siglip_path)

    def load_ckpt(self, ckpt_path):
        if ckpt_path is not None:
            from safetensors.torch import load_file as load_sft
            print("Loading checkpoint to replace old keys")
            # load_sft doesn't support torch.device
            if ckpt_path.endswith('safetensors'):
                sd = load_sft(ckpt_path, device='cpu')
                missing, unexpected = self.model.load_state_dict(sd, strict=False, assign=True)
            else:
                dit_state = torch.load(ckpt_path, map_location='cpu')
                sd = {}
                for k in dit_state.keys():
                    sd[k.replace('module.', '')] = dit_state[k]
                missing, unexpected = self.model.load_state_dict(sd, strict=False, assign=True)
            self.model.to(str(self.device))
            print(f"missing keys: {missing}\n\nunexpected keys: {unexpected}")

    def __call__(
        self,
        prompt: str,
        width: int = 512,
        height: int = 512,
        guidance: float = 4,
        num_steps: int = 50,
        seed: int = 123456789,
        **kwargs
    ):
        width = 16 * (width // 16)
        height = 16 * (height // 16)

        device_type = self.device if isinstance(self.device, str) else self.device.type
        if device_type == "mps":
            device_type = "cpu"  # to support macOS MPS
        with torch.autocast(enabled=self.use_fp8, device_type=device_type, dtype=torch.bfloat16):
            return self.forward(
                prompt,
                width,
                height,
                guidance,
                num_steps,
                seed,
                **kwargs
            )

    @torch.inference_mode()
    def forward(
        self,
        prompt: str,
        width: int,
        height: int,
        guidance: float,
        num_steps: int,
        seed: int,
        ref_imgs: list[Image.Image] | None = None,
        arcface_embeddings: list[torch.Tensor] = None,
        bboxes=None,
        id_weight: float = 1.0,
        siglip_weight: float = 1.0,
    ):
        x = get_noise(
            1, height, width, device=self.device,
            dtype=torch.bfloat16, seed=seed
        )
        timesteps = get_schedule(
            num_steps,
            (width // 8) * (height // 8) // (16 * 16),
            shift=True,
        )
        if self.offload:
            self.ae.encoder = self.ae.encoder.to(self.device)

        if ref_imgs is None:
            siglip_embeddings = None
        else:
            siglip_embeddings = self.siglip(ref_imgs).to(self.device, torch.bfloat16).permute(1, 0, 2, 3)
            # num_ref, (1), n, d

        if arcface_embeddings is not None:
            arcface_embeddings = arcface_embeddings.unsqueeze(1)
            # num_ref, 1, 512
            arcface_embeddings = arcface_embeddings.to(self.device, torch.bfloat16)

        if self.offload:
            self.offload_model_to_cpu(self.ae.encoder)
            self.t5, self.clip = self.t5.to(self.device), self.clip.to(self.device)

        inp_cond = prepare(t5=self.t5, clip=self.clip, img=x, prompt=prompt)
        if self.offload:
            self.offload_model_to_cpu(self.t5, self.clip)
            self.model = self.model.to(self.device)

        img = inp_cond["img"]
        img_length = img.shape[1]
        ##### create masks for siglip and arcface #####
        if bboxes is not None:
            arc_mask = create_person_cross_attention_mask_varlen(
                batch_size=img.shape[0],
                img_len=img_length,
                id_len=8,
                bbox_lists=bboxes,
                max_num_ids=len(bboxes[0]),
                original_width=width,
                original_height=height,
            ).to(img.device)
            siglip_mask = create_person_cross_attention_mask_varlen(
                batch_size=img.shape[0],
                img_len=img_length,
                id_len=256 + 8,
                bbox_lists=bboxes,
                max_num_ids=len(bboxes[0]),
                original_width=width,
                original_height=height,
            ).to(img.device)

        results = denoise(
            self.model,
            **inp_cond,
            timesteps=timesteps,
            guidance=guidance,
            arcface_embeddings=arcface_embeddings,
            siglip_embeddings=siglip_embeddings,
            bboxes=bboxes,
            id_weight=id_weight,
            siglip_weight=siglip_weight,
            img_height=height,
            img_width=width,
            arc_mask=arc_mask if bboxes is not None else None,
            siglip_mask=siglip_mask if bboxes is not None else None,
        )

        x = results

        if self.offload:
            self.offload_model_to_cpu(self.model)
            self.ae.decoder.to(x.device)
        x = unpack(x.float(), height, width)
        x = self.ae.decode(x)
        self.offload_model_to_cpu(self.ae.decoder)

        x1 = x.clamp(-1, 1)
        x1 = rearrange(x1[-1], "c h w -> h w c")
        output_img = Image.fromarray((127.5 * (x1 + 1.0)).cpu().byte().numpy())

        return output_img

    def offload_model_to_cpu(self, *models):
        if not self.offload:
            return
        for model in models:
            model.cpu()
            torch.cuda.empty_cache()
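For orientation, a hedged usage sketch of the WithAnyonePipeline class added above (this is not code from the commit): the reference image path, output size, bbox layout, and the ArcFace-embedding step are assumptions inferred from the signatures and defaults shown above.

import numpy as np
import torch
from PIL import Image
from withanyone.flux.pipeline import WithAnyonePipeline, FaceExtractor

extractor = FaceExtractor()                      # antelopev2 detector, same default as the pipeline
pipeline = WithAnyonePipeline(
    "flux-dev",                                  # model_type
    "WithAnyone/WithAnyone",                     # ipa_path, resolved to withanyone.safetensors on the Hub
    device=torch.device("cuda"),
    face_extractor=extractor,
)

ref = Image.open("person.jpg").convert("RGB")    # hypothetical reference image
face_crop = extractor(ref)                       # 512x512 face crop for the SigLIP branch

# ArcFace identity embedding; assumed here to come from the same insightface model (512-d per face).
faces = extractor.model.get(np.array(ref))
arcface = torch.from_numpy(np.stack([f.normed_embedding for f in faces[:1]]))  # (num_ref, 512)

image = pipeline(
    prompt="a person reading in a cafe",
    width=768,
    height=768,
    num_steps=25,
    ref_imgs=[face_crop],
    arcface_embeddings=arcface,
    bboxes=[[[192, 160, 448, 416]]],             # assumed layout: one [x1, y1, x2, y2] box per identity, per image
    id_weight=1.0,
    siglip_weight=1.0,
)
image.save("out.png")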
withanyone/flux/sampling.py
ADDED
@@ -0,0 +1,171 @@
import math
from typing import Literal

import torch
from einops import rearrange, repeat
from torch import Tensor
from tqdm import tqdm

from .model import Flux
from .modules.conditioner import HFEmbedder


def get_noise(
    num_samples: int,
    height: int,
    width: int,
    device: torch.device,
    dtype: torch.dtype,
    seed: int,
):
    return torch.randn(
        num_samples,
        16,
        # allow for packing
        2 * math.ceil(height / 16),
        2 * math.ceil(width / 16),
        device=device,
        dtype=dtype,
        generator=torch.Generator(device=device).manual_seed(seed),
    )


def prepare(
    t5: HFEmbedder,
    clip: HFEmbedder,
    img: Tensor,
    prompt: str | list[str],
) -> dict[str, Tensor]:
    bs, c, h, w = img.shape
    if bs == 1 and not isinstance(prompt, str):
        bs = len(prompt)

    img = rearrange(img, "b c (h ph) (w pw) -> b (h w) (c ph pw)", ph=2, pw=2)
    if img.shape[0] == 1 and bs > 1:
        img = repeat(img, "1 ... -> bs ...", bs=bs)

    img_ids = torch.zeros(h // 2, w // 2, 3)
    img_ids[..., 1] = img_ids[..., 1] + torch.arange(h // 2)[:, None]
    img_ids[..., 2] = img_ids[..., 2] + torch.arange(w // 2)[None, :]
    img_ids = repeat(img_ids, "h w c -> b (h w) c", b=bs)

    if isinstance(prompt, str):
        prompt = [prompt]
    txt = t5(prompt)
    if txt.shape[0] == 1 and bs > 1:
        txt = repeat(txt, "1 ... -> bs ...", bs=bs)
    txt_ids = torch.zeros(bs, txt.shape[1], 3)

    vec = clip(prompt)
    if vec.shape[0] == 1 and bs > 1:
        vec = repeat(vec, "1 ... -> bs ...", bs=bs)

    return {
        "img": img,
        "img_ids": img_ids.to(img.device),
        "txt": txt.to(img.device),
        "txt_ids": txt_ids.to(img.device),
        "vec": vec.to(img.device),
    }


def time_shift(mu: float, sigma: float, t: Tensor):
    return math.exp(mu) / (math.exp(mu) + (1 / t - 1) ** sigma)


def get_lin_function(
    x1: float = 256, y1: float = 0.5, x2: float = 4096, y2: float = 1.15
):
    m = (y2 - y1) / (x2 - x1)
    b = y1 - m * x1
    return lambda x: m * x + b


def get_schedule(
    num_steps: int,
    image_seq_len: int,
    base_shift: float = 0.5,
    max_shift: float = 1.15,
    shift: bool = True,
) -> list[float]:
    # extra step for zero
    timesteps = torch.linspace(1, 0, num_steps + 1)

    # shifting the schedule to favor high timesteps for higher-signal images
    if shift:
        # estimate mu by linear interpolation between two anchor points
        mu = get_lin_function(y1=base_shift, y2=max_shift)(image_seq_len)
        timesteps = time_shift(mu, 1.0, timesteps)

    return timesteps.tolist()


def denoise(
    model: Flux,
    # model input
    img: Tensor,
    img_ids: Tensor,
    txt: Tensor,
    txt_ids: Tensor,
    vec: Tensor,
    timesteps: list[float],
    guidance: float = 4.0,
    arcface_embeddings=None,
    siglip_embeddings=None,
    bboxes: Tensor = None,
    id_weight: float = 1.0,  # weight for identity embeddings
    siglip_weight: float = 1.0,  # weight for siglip embeddings
    img_height: int = 512,
    img_width: int = 512,
    arc_mask=None,
    siglip_mask=None,
):
    i = 0
    guidance_vec = torch.full((img.shape[0],), guidance, device=img.device, dtype=img.dtype)
    for t_curr, t_prev in tqdm(zip(timesteps[:-1], timesteps[1:]), total=len(timesteps) - 1):
        t_vec = torch.full((img.shape[0],), t_curr, dtype=img.dtype, device=img.device)

        pred = model(
            img=img,
            img_ids=img_ids,
            siglip_embeddings=siglip_embeddings,
            txt=txt,
            txt_ids=txt_ids,
            y=vec,
            timesteps=t_vec,
            guidance=guidance_vec,
            arcface_embeddings=arcface_embeddings,
            bbox_lists=bboxes,
            id_weight=id_weight,
            siglip_weight=siglip_weight,
            img_height=img_height,
            img_width=img_width,
            arc_mask=arc_mask,
            siglip_mask=siglip_mask,
        )
        # Euler step of the rectified-flow ODE: x <- x + (t_prev - t_curr) * v
        img = img + (t_prev - t_curr) * pred
        i += 1

    return img


def unpack(x: Tensor, height: int, width: int) -> Tensor:
    return rearrange(
        x,
        "b (h w) (c ph pw) -> b c (h ph) (w pw)",
        h=math.ceil(height / 16),
        w=math.ceil(width / 16),
        ph=2,
        pw=2,
    )
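A small standalone illustration (not part of the commit) of what the shift in get_schedule does: a larger image sequence length gives a larger mu, which pushes the timestep boundaries toward 1 so more of the step budget is spent at high noise levels. The helper names below are local to this sketch and simply mirror get_lin_function and time_shift above.

import math
import torch

def lin(x, x1=256.0, y1=0.5, x2=4096.0, y2=1.15):
    # same linear fit as get_lin_function above
    m = (y2 - y1) / (x2 - x1)
    return m * x + (y1 - m * x1)

def shift(mu, t, sigma=1.0):
    # same mapping as time_shift above; t == 0 maps to 0
    return math.exp(mu) / (math.exp(mu) + (1 / t - 1) ** sigma)

t = torch.linspace(1, 0, 5 + 1)        # 5 steps -> 6 boundaries, as in get_schedule
for seq_len in (256, 1024, 4096):      # token counts spanning the fit's two anchor points
    mu = lin(seq_len)
    print(seq_len, [round(v, 3) for v in shift(mu, t).tolist()])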
withanyone/flux/util.py
ADDED
@@ -0,0 +1,518 @@
import os
from dataclasses import dataclass

import torch
import json
import numpy as np
from huggingface_hub import hf_hub_download
from safetensors import safe_open
from safetensors.torch import load_file as load_sft

from withanyone.flux.model import Flux, FluxParams
from .modules.autoencoder import AutoEncoder, AutoEncoderParams
from .modules.conditioner import HFEmbedder

import re
from withanyone.flux.modules.layers import DoubleStreamBlockLoraProcessor, SingleStreamBlockLoraProcessor


def c_crop(image):
    width, height = image.size
    new_size = min(width, height)
    left = (width - new_size) / 2
    top = (height - new_size) / 2
    right = (width + new_size) / 2
    bottom = (height + new_size) / 2
    return image.crop((left, top, right, bottom))


def pad64(x):
    return int(np.ceil(float(x) / 64.0) * 64 - x)


def HWC3(x):
    assert x.dtype == np.uint8
    if x.ndim == 2:
        x = x[:, :, None]
    assert x.ndim == 3
    H, W, C = x.shape
    assert C == 1 or C == 3 or C == 4
    if C == 3:
        return x
    if C == 1:
        return np.concatenate([x, x, x], axis=2)
    if C == 4:
        color = x[:, :, 0:3].astype(np.float32)
        alpha = x[:, :, 3:4].astype(np.float32) / 255.0
        y = color * alpha + 255.0 * (1.0 - alpha)
        y = y.clip(0, 255).astype(np.uint8)
        return y


@dataclass
class ModelSpec:
    params: FluxParams
    ae_params: AutoEncoderParams
    repo_id: str | None
    repo_flow: str | None
    repo_ae: str | None
    repo_id_ae: str | None


configs = {
    "flux-dev": ModelSpec(
        repo_id="black-forest-labs/FLUX.1-dev",
        repo_id_ae="black-forest-labs/FLUX.1-dev",
        repo_flow="flux1-dev.safetensors",
        repo_ae="ae.safetensors",
        params=FluxParams(
            in_channels=64,
            vec_in_dim=768,
            context_in_dim=4096,
            hidden_size=3072,
            mlp_ratio=4.0,
            num_heads=24,
            depth=19,
            depth_single_blocks=38,
            axes_dim=[16, 56, 56],
            theta=10_000,
            qkv_bias=True,
            guidance_embed=True,
        ),
        ae_params=AutoEncoderParams(
            resolution=256,
            in_channels=3,
            ch=128,
            out_ch=3,
            ch_mult=[1, 2, 4, 4],
            num_res_blocks=2,
            z_channels=16,
            scale_factor=0.3611,
            shift_factor=0.1159,
        ),
    ),
    "flux-dev-fp8": ModelSpec(
        repo_id="black-forest-labs/FLUX.1-dev",
        repo_id_ae="black-forest-labs/FLUX.1-dev",
        repo_flow="flux1-dev.safetensors",
        repo_ae="ae.safetensors",
        params=FluxParams(
            in_channels=64,
            vec_in_dim=768,
            context_in_dim=4096,
            hidden_size=3072,
            mlp_ratio=4.0,
            num_heads=24,
            depth=19,
            depth_single_blocks=38,
            axes_dim=[16, 56, 56],
            theta=10_000,
            qkv_bias=True,
            guidance_embed=True,
        ),
        ae_params=AutoEncoderParams(
            resolution=256,
            in_channels=3,
            ch=128,
            out_ch=3,
            ch_mult=[1, 2, 4, 4],
            num_res_blocks=2,
            z_channels=16,
            scale_factor=0.3611,
            shift_factor=0.1159,
        ),
    ),
    "flux-krea": ModelSpec(
        repo_id="black-forest-labs/FLUX.1-Krea-dev",
        repo_id_ae="black-forest-labs/FLUX.1-Krea-dev",
        repo_flow="flux1-krea-dev.safetensors",
        repo_ae="ae.safetensors",
        params=FluxParams(
            in_channels=64,
            vec_in_dim=768,
            context_in_dim=4096,
            hidden_size=3072,
            mlp_ratio=4.0,
            num_heads=24,
            depth=19,
            depth_single_blocks=38,
            axes_dim=[16, 56, 56],
            theta=10_000,
            qkv_bias=True,
            guidance_embed=True,
        ),
        ae_params=AutoEncoderParams(
            resolution=256,
            in_channels=3,
            ch=128,
            out_ch=3,
            ch_mult=[1, 2, 4, 4],
            num_res_blocks=2,
            z_channels=16,
            scale_factor=0.3611,
            shift_factor=0.1159,
        ),
    ),
    "flux-schnell": ModelSpec(
        repo_id="black-forest-labs/FLUX.1-schnell",
        repo_id_ae="black-forest-labs/FLUX.1-dev",
        repo_flow="flux1-schnell.safetensors",
        repo_ae="ae.safetensors",
        params=FluxParams(
            in_channels=64,
            vec_in_dim=768,
            context_in_dim=4096,
            hidden_size=3072,
            mlp_ratio=4.0,
            num_heads=24,
            depth=19,
            depth_single_blocks=38,
            axes_dim=[16, 56, 56],
            theta=10_000,
            qkv_bias=True,
            guidance_embed=False,
        ),
        ae_params=AutoEncoderParams(
            resolution=256,
            in_channels=3,
            ch=128,
            out_ch=3,
            ch_mult=[1, 2, 4, 4],
            num_res_blocks=2,
            z_channels=16,
            scale_factor=0.3611,
            shift_factor=0.1159,
        ),
    ),
}


def print_load_warning(missing: list[str], unexpected: list[str]) -> None:
    if len(missing) > 0 and len(unexpected) > 0:
        print(f"Got {len(missing)} missing keys:\n\t" + "\n\t".join(missing))
        print("\n" + "-" * 79 + "\n")
        print(f"Got {len(unexpected)} unexpected keys:\n\t" + "\n\t".join(unexpected))
    elif len(missing) > 0:
        print(f"Got {len(missing)} missing keys:\n\t" + "\n\t".join(missing))
    elif len(unexpected) > 0:
        print(f"Got {len(unexpected)} unexpected keys:\n\t" + "\n\t".join(unexpected))


def load_from_repo_id(repo_id, checkpoint_name):
    ckpt_path = hf_hub_download(repo_id, checkpoint_name)
    sd = load_sft(ckpt_path, device='cpu')
    return sd


def load_flow_model_no_lora(
    name: str,
    path: str,
    ipa_path: str,
    device: str | torch.device = "cuda",
    hf_download: bool = True,
    lora_rank: int = 16,
    use_fp8: bool = False,
):
    # Loading Flux
    print("Init model")
    ckpt_path = path
    if ckpt_path == "black-forest-labs/FLUX.1-dev" or (
        ckpt_path is None
        and configs[name].repo_id is not None
        and configs[name].repo_flow is not None
        and hf_download
    ):
        ckpt_path = hf_hub_download(configs[name].repo_id, configs[name].repo_flow.replace("sft", "safetensors"))
        print("Downloading checkpoint from HF:", ckpt_path)
    else:
        ckpt_path = os.path.join(path, "flux1-dev.safetensors") if path is not None else None

    ipa_ckpt_path = ipa_path

    with torch.device("meta" if ckpt_path is not None else device):
        model = Flux(configs[name].params)

    # model = set_lora(model, lora_rank, device="meta" if ipa_ckpt_path is not None else device)

    if ckpt_path is not None:
        if ipa_ckpt_path == 'WithAnyone/WithAnyone':
            ipa_ckpt_path = hf_hub_download("WithAnyone/WithAnyone", "withanyone.safetensors")

        # Load the WithAnyone (IPA) weights from safetensors or a torch checkpoint.
        lora_sd = load_sft(ipa_ckpt_path, device=str(device)) if ipa_ckpt_path.endswith("safetensors") \
            else torch.load(ipa_ckpt_path, map_location='cpu')

        print("Loading main checkpoint")
        # load_sft doesn't support torch.device

        if ckpt_path.endswith('safetensors'):
            if use_fp8:
                print(
                    "####\n"
                    "We are in fp8 mode right now, since the fp8 checkpoint of XLabs-AI/flux-dev-fp8 seems broken.\n"
                    "We convert the fp8 checkpoint on the fly from the bf16 checkpoint.\n"
                    "If your storage is constrained, "
                    "you can save the fp8 checkpoint and replace the bf16 checkpoint yourself.\n"
                )
                sd = load_sft(ckpt_path, device="cpu")
                sd = {k: v.to(dtype=torch.float8_e4m3fn, device=device) for k, v in sd.items()}
            else:
                sd = load_sft(ckpt_path, device=str(device))

            # Then proceed with the update
            sd.update(lora_sd)
            missing, unexpected = model.load_state_dict(sd, strict=False, assign=True)
        else:
            dit_state = torch.load(ckpt_path, map_location='cpu')
            sd = {}
            for k in dit_state.keys():
                sd[k.replace('module.', '')] = dit_state[k]
            sd.update(lora_sd)
            missing, unexpected = model.load_state_dict(sd, strict=False, assign=True)
            model.to(str(device))
        print_load_warning(missing, unexpected)
    return model


def merge_to_flux_model(
    loading_device, working_device, flux_state_dict, model, ratio, merge_dtype, save_dtype, mem_eff_load_save=False
):
    # `model` here is a path to a LoRA checkpoint file that gets merged into flux_state_dict.
    lora_name_to_module_key = {}
    keys = list(flux_state_dict.keys())
    for key in keys:
        if key.endswith(".weight"):
            module_name = ".".join(key.split(".")[:-1])
            lora_name = "lora_unet" + "_" + module_name.replace(".", "_")
            lora_name_to_module_key[lora_name] = key

    print(f"loading: {model}")
    lora_sd = load_sft(model, device=loading_device) if model.endswith("safetensors") \
        else torch.load(model, map_location='cpu')

    print("merging...")
    for key in list(lora_sd.keys()):
        if "lora_down" in key:
            lora_name = key[: key.rfind(".lora_down")]
            up_key = key.replace("lora_down", "lora_up")
            alpha_key = key[: key.index("lora_down")] + "alpha"

            if lora_name not in lora_name_to_module_key:
                print(f"no module found for LoRA weight: {key}. LoRA for Text Encoder is not supported yet.")
                continue

            down_weight = lora_sd.pop(key)
            up_weight = lora_sd.pop(up_key)

            dim = down_weight.size()[0]
            alpha = lora_sd.pop(alpha_key, dim)
            scale = alpha / dim

            # W <- W + U * D
            module_weight_key = lora_name_to_module_key[lora_name]
            if module_weight_key not in flux_state_dict:
                print(f"no module found for LoRA weight: {module_weight_key}")
            else:
                weight = flux_state_dict[module_weight_key]

                weight = weight.to(working_device, merge_dtype)
                up_weight = up_weight.to(working_device, merge_dtype)
                down_weight = down_weight.to(working_device, merge_dtype)

                if len(weight.size()) == 2:
                    # linear
                    weight = weight + ratio * (up_weight @ down_weight) * scale
                elif down_weight.size()[2:4] == (1, 1):
                    # conv2d 1x1
                    weight = (
                        weight
                        + ratio
                        * (up_weight.squeeze(3).squeeze(2) @ down_weight.squeeze(3).squeeze(2)).unsqueeze(2).unsqueeze(3)
                        * scale
                    )
                else:
                    # conv2d 3x3
                    conved = torch.nn.functional.conv2d(down_weight.permute(1, 0, 2, 3), up_weight).permute(1, 0, 2, 3)
                    weight = weight + ratio * conved * scale

                flux_state_dict[module_weight_key] = weight.to(loading_device, save_dtype)
                del up_weight
                del down_weight
                del weight

    if len(lora_sd) > 0:
        print(f"Unused keys in LoRA model: {list(lora_sd.keys())}")

    return flux_state_dict


def load_flow_model_diffusers(
    name: str,
    path: str,
    ipa_path: str,
    device: str | torch.device = "cuda",
    hf_download: bool = True,
    lora_rank: int = 16,
    use_fp8: bool = False,
    additional_lora_ckpt: str | None = None,
    lora_weight: float = 1.0,
):
    # Loading Flux
    print("Init model")

    ckpt_path = os.path.join(path, "flux1-dev.safetensors") if path is not None else None
    print("Loading checkpoint from", ckpt_path)
    if (
        ckpt_path is None
        and configs[name].repo_id is not None
        and configs[name].repo_flow is not None
        and hf_download
    ):
        ckpt_path = hf_hub_download(configs[name].repo_id, configs[name].repo_flow.replace("sft", "safetensors"))

    ipa_ckpt_path = ipa_path

    with torch.device("meta" if ckpt_path is not None else device):
        model = Flux(configs[name].params)

    # if additional_lora_ckpt is not None:
    #     model = set_lora(model, lora_rank, device="meta" if ipa_ckpt_path is not None else device)
    assert additional_lora_ckpt is not None, "additional_lora_ckpt should have been provided. this must be a bug"

    if ckpt_path is not None:
        if ipa_ckpt_path == 'WithAnyone/WithAnyone':
            ipa_ckpt_path = hf_hub_download("WithAnyone/WithAnyone", "withanyone.safetensors")

        # Load the WithAnyone (IPA) weights from safetensors or a torch checkpoint.
        lora_sd = load_sft(ipa_ckpt_path, device=str(device)) if ipa_ckpt_path.endswith("safetensors") \
            else torch.load(ipa_ckpt_path, map_location='cpu')

        extra_lora_path = additional_lora_ckpt

        print("Loading main checkpoint")
        # load_sft doesn't support torch.device

        if ckpt_path.endswith('safetensors'):
            if use_fp8:
                print(
                    "####\n"
                    "We are in fp8 mode right now, since the fp8 checkpoint of XLabs-AI/flux-dev-fp8 seems broken.\n"
                    "We convert the fp8 checkpoint on the fly from the bf16 checkpoint.\n"
                    "If your storage is constrained, "
                    "you can save the fp8 checkpoint and replace the bf16 checkpoint yourself.\n"
                )
                sd = load_sft(ckpt_path, device="cpu")
                sd = {k: v.to(dtype=torch.float8_e4m3fn, device=device) for k, v in sd.items()}
            else:
                sd = load_sft(ckpt_path, device=str(device))

            if extra_lora_path is not None:
                print("Merging extra lora to main checkpoint")
                lora_ckpt_path = extra_lora_path
                sd = merge_to_flux_model("cpu", device, sd, lora_ckpt_path, lora_weight, torch.float8_e4m3fn if use_fp8 else torch.bfloat16, torch.float8_e4m3fn if use_fp8 else torch.bfloat16)
            # Then proceed with the update
            sd.update(lora_sd)

            missing, unexpected = model.load_state_dict(sd, strict=False, assign=True)
            model.to(str(device))
        else:
            dit_state = torch.load(ckpt_path, map_location='cpu')
            sd = {}
            for k in dit_state.keys():
                sd[k.replace('module.', '')] = dit_state[k]

            if extra_lora_path is not None:
                print("Merging extra lora to main checkpoint")
                lora_ckpt_path = extra_lora_path
                sd = merge_to_flux_model("cpu", device, sd, lora_ckpt_path, 1.0, torch.float8_e4m3fn if use_fp8 else torch.bfloat16, torch.float8_e4m3fn if use_fp8 else torch.bfloat16)
            sd.update(lora_sd)

            missing, unexpected = model.load_state_dict(sd, strict=False, assign=True)
            model.to(str(device))
        print_load_warning(missing, unexpected)

    return model


def set_lora(
    model: Flux,
    lora_rank: int,
    double_blocks_indices: list[int] | None = None,
    single_blocks_indices: list[int] | None = None,
    device: str | torch.device = "cpu",
) -> Flux:
    double_blocks_indices = list(range(model.params.depth)) if double_blocks_indices is None else double_blocks_indices
    single_blocks_indices = list(range(model.params.depth_single_blocks)) if single_blocks_indices is None \
        else single_blocks_indices

    lora_attn_procs = {}
    with torch.device(device):
        for name, attn_processor in model.attn_processors.items():
            match = re.search(r'\.(\d+)\.', name)
            if match:
                layer_index = int(match.group(1))

            if name.startswith("double_blocks") and layer_index in double_blocks_indices:
                lora_attn_procs[name] = DoubleStreamBlockLoraProcessor(dim=model.params.hidden_size, rank=lora_rank)
            elif name.startswith("single_blocks") and layer_index in single_blocks_indices:
                lora_attn_procs[name] = SingleStreamBlockLoraProcessor(dim=model.params.hidden_size, rank=lora_rank)
            else:
                lora_attn_procs[name] = attn_processor
    model.set_attn_processor(lora_attn_procs)
    return model


def load_t5(t5_path, device: str | torch.device = "cuda", max_length: int = 512) -> HFEmbedder:
    # max length 64, 128, 256 and 512 should work (if your sequence is short enough)
    version = t5_path
    return HFEmbedder(version, max_length=max_length, torch_dtype=torch.bfloat16).to(device)


def load_clip(clip_path, device: str | torch.device = "cuda") -> HFEmbedder:
    version = clip_path
    return HFEmbedder(version, max_length=77, torch_dtype=torch.bfloat16).to(device)


def load_ae(flux_path, name: str, device: str | torch.device = "cuda", hf_download: bool = True) -> AutoEncoder:
    if flux_path in ("black-forest-labs/FLUX.1-dev", "black-forest-labs/FLUX.1-schnell", "black-forest-labs/FLUX.1-Krea-dev", "black-forest-labs/FLUX.1-Kontext-dev"):
        ckpt_path = hf_hub_download("black-forest-labs/FLUX.1-dev", "ae.safetensors")
    else:
        ckpt_path = os.path.join(flux_path, "ae.safetensors")
        if not os.path.exists(ckpt_path):
            # fall back to the diffusers-style layout
            ckpt_path = os.path.join(flux_path, "vae", "ae.safetensors")
            if not os.path.exists(ckpt_path):
                raise FileNotFoundError(f"Cannot find ae checkpoint in {flux_path}/ae.safetensors or {flux_path}/vae/ae.safetensors")

    # Loading the autoencoder
    print("Init AE")
    with torch.device("meta" if ckpt_path is not None else device):
        ae = AutoEncoder(configs[name].ae_params)

    # if ckpt_path is not None:
    assert ckpt_path is not None, "ckpt_path should have been provided. this must be a bug"
    sd = load_sft(ckpt_path, device=str(device))
    missing, unexpected = ae.load_state_dict(sd, strict=False, assign=True)
    print_load_warning(missing, unexpected)
    return ae
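A toy check (not from the commit) of the linear-layer merge rule used in merge_to_flux_model above, W <- W + ratio * (up @ down) * (alpha / rank), applied to a single weight matrix with made-up sizes:

import torch

out_f, in_f, rank, ratio, alpha = 8, 6, 2, 1.0, 2.0
W = torch.randn(out_f, in_f)
up = torch.randn(out_f, rank)      # lora_up.weight
down = torch.randn(rank, in_f)     # lora_down.weight

merged = W + ratio * (up @ down) * (alpha / rank)

# The merged matrix behaves like the base layer plus the scaled LoRA branch.
x = torch.randn(3, in_f)
assert torch.allclose(
    x @ merged.T,
    x @ W.T + (x @ down.T @ up.T) * ratio * (alpha / rank),
    atol=1e-5,
)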
withanyone/utils/convert_yaml_to_args_file.py
ADDED
@@ -0,0 +1,22 @@
import argparse
import yaml

parser = argparse.ArgumentParser()
parser.add_argument("--yaml", type=str, required=True)
parser.add_argument("--arg", type=str, required=True)
args = parser.parse_args()


with open(args.yaml, "r") as f:
    data = yaml.safe_load(f)

with open(args.arg, "w") as f:
    for k, v in data.items():
        if isinstance(v, list):
            v = list(map(str, v))
            v = " ".join(v)
        if v is None:
            continue
        print(f"--{k} {v}", end=" ", file=f)
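As a usage note (the file names here are only illustrative), the converter is run as `python withanyone/utils/convert_yaml_to_args_file.py --yaml config.yaml --arg args.txt`. A YAML entry such as `double_blocks_indices: [0, 1, 2]` becomes the flag text `--double_blocks_indices 0 1 2`: list values are space-joined, `null` entries are skipped, and all flags are written on a single line that can be spliced into a training command.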