Upload folder using huggingface_hub
- .gitattributes +7 -0
- README.md +78 -0
- README_from_modelscope.md +107 -0
- assets/samples/eligen_example_1.png +3 -0
- assets/samples/eligen_example_1_mask.png +0 -0
- assets/samples/eligen_example_2.png +3 -0
- assets/samples/eligen_example_2_mask.png +0 -0
- assets/samples/eligen_example_3.png +3 -0
- assets/samples/eligen_example_3_mask.png +0 -0
- assets/samples/eligen_example_4.png +3 -0
- assets/samples/eligen_example_4_mask.png +0 -0
- assets/samples/eligen_example_5.png +3 -0
- assets/samples/eligen_example_5_mask.png +0 -0
- assets/samples/poster.png +3 -0
- assets/samples/poster_region.png +0 -0
- assets/title.png +3 -0
- configuration.json +1 -0
- model.safetensors +3 -0
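
For reference, a commit like this one can be produced with the `huggingface_hub` Python API; the title above is the library's default commit message for a folder upload. A minimal sketch (the local `folder_path` is a hypothetical value for illustration):

```python
from huggingface_hub import upload_folder

# Uploads every file in the local folder to the repository; binaries matched
# by .gitattributes patterns (see the diff below) are stored via Git LFS.
upload_folder(
    repo_id="DiffSynth-Studio/Qwen-Image-EliGen",  # this repository
    folder_path="./Qwen-Image-EliGen",             # hypothetical local path
)
```
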
.gitattributes
CHANGED
@@ -33,3 +33,10 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+assets/samples/eligen_example_1.png filter=lfs diff=lfs merge=lfs -text
+assets/samples/eligen_example_2.png filter=lfs diff=lfs merge=lfs -text
+assets/samples/eligen_example_3.png filter=lfs diff=lfs merge=lfs -text
+assets/samples/eligen_example_4.png filter=lfs diff=lfs merge=lfs -text
+assets/samples/eligen_example_5.png filter=lfs diff=lfs merge=lfs -text
+assets/samples/poster.png filter=lfs diff=lfs merge=lfs -text
+assets/title.png filter=lfs diff=lfs merge=lfs -text

README.md
ADDED
@@ -0,0 +1,78 @@
---
license: apache-2.0
---

# Qwen-Image Precise Region Control Model

![](./assets/title.png)

## Model Introduction

This model is a precise region control model trained on [Qwen-Image](https://www.modelscope.cn/models/Qwen/Qwen-Image), with a LoRA architecture. It controls the position and shape of each entity by taking as input both a textual description and a regional condition (a mask map) for each entity. The training framework is built on [DiffSynth-Studio](https://github.com/modelscope/DiffSynth-Studio), using the dataset [DiffSynth-Studio/EliGenTrainSet](https://www.modelscope.cn/datasets/DiffSynth-Studio/EliGenTrainSet).

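Each mask map marks one entity's target region. A minimal sketch for building such a mask with PIL, assuming a white-region-on-black-background convention (the example masks downloaded in the inference code below can be inspected for the exact format); this illustrates the input, it is not part of the DiffSynth-Studio API:

```python
from PIL import Image, ImageDraw

# Black canvas at the generation resolution used in the example below.
mask = Image.new("RGB", (1328, 1328), (0, 0, 0))
draw = ImageDraw.Draw(mask)
# Paint the entity's target region in white; any shape works, not just boxes.
draw.rectangle([100, 200, 700, 900], fill=(255, 255, 255))
mask.save("entity_0_mask.png")
```
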
## Results Demonstration

|Entity Control Condition|Generated Image|
|-|-|
|![](./assets/samples/eligen_example_1_mask.png)|![](./assets/samples/eligen_example_1.png)|
|![](./assets/samples/eligen_example_2_mask.png)|![](./assets/samples/eligen_example_2.png)|
|![](./assets/samples/eligen_example_3_mask.png)|![](./assets/samples/eligen_example_3.png)|
|![](./assets/samples/eligen_example_4_mask.png)|![](./assets/samples/eligen_example_4.png)|
|![](./assets/samples/eligen_example_5_mask.png)|![](./assets/samples/eligen_example_5.png)|
|![](./assets/samples/poster_region.png)|![](./assets/samples/poster.png)|

## Inference Code
```
git clone https://github.com/modelscope/DiffSynth-Studio.git
cd DiffSynth-Studio
pip install -e .
```

```python
import torch
from PIL import Image
from diffsynth.pipelines.qwen_image import QwenImagePipeline, ModelConfig
from modelscope import dataset_snapshot_download, snapshot_download
```

```python
# Load the base Qwen-Image components (DiT, text encoder, VAE) and tokenizer.
pipe = QwenImagePipeline.from_pretrained(
    torch_dtype=torch.bfloat16,
    device="cuda",
    model_configs=[
        ModelConfig(model_id="Qwen/Qwen-Image", origin_file_pattern="transformer/diffusion_pytorch_model*.safetensors"),
        ModelConfig(model_id="Qwen/Qwen-Image", origin_file_pattern="text_encoder/model*.safetensors"),
        ModelConfig(model_id="Qwen/Qwen-Image", origin_file_pattern="vae/diffusion_pytorch_model.safetensors"),
    ],
    tokenizer_config=ModelConfig(model_id="Qwen/Qwen-Image", origin_file_pattern="tokenizer/"),
)
# Download the EliGen LoRA weights and attach them to the DiT.
snapshot_download("DiffSynth-Studio/Qwen-Image-EliGen", local_dir="models/DiffSynth-Studio/Qwen-Image-EliGen", allow_file_pattern="model.safetensors")
pipe.load_lora(pipe.dit, "models/DiffSynth-Studio/Qwen-Image-EliGen/model.safetensors")

# The global prompt describes the whole image; each entity prompt describes
# one region and is paired with the mask of the same index.
global_prompt = "Poster for the Qwen-Image-EliGen Magic Café, featuring two magical coffees—one emitting flames and the other emitting ice spikes—against a light blue misty background, with text reading 'Qwen-Image-EliGen Magic Café' and 'New Arrival'"
entity_prompts = [
    "A red magic coffee with flames rising from the cup",
    "A red magic coffee surrounded by ice spikes",
    "Text: 'New Arrival'",
    "Text: 'Qwen-Image-EliGen Magic Café'",
]

# Download the example entity masks and resize them to the generation resolution.
dataset_snapshot_download(dataset_id="DiffSynth-Studio/examples_in_diffsynth", local_dir="./", allow_file_pattern="data/examples/eligen/qwen-image/example_6/*.png")
masks = [Image.open(f"./data/examples/eligen/qwen-image/example_6/{i}.png").convert("RGB").resize((1328, 1328)) for i in range(len(entity_prompts))]

image = pipe(
    prompt=global_prompt,
    seed=0,
    eligen_entity_prompts=entity_prompts,
    eligen_entity_masks=masks,
)
image.save("image.jpg")
```

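When adapting the example to custom layouts, it can help to overlay the masks on the generated image to verify entity placement. A small PIL sketch that reuses the `image` and `masks` variables from the code above (an illustration only, not a DiffSynth-Studio API):

```python
from PIL import Image

# Tint each entity's mask region with a distinct color over the output image.
overlay = image.convert("RGB")
colors = [(255, 0, 0), (0, 255, 0), (0, 0, 255), (255, 255, 0)]
for mask, color in zip(masks, colors):
    tint = Image.new("RGB", overlay.size, color)
    # Use the white mask region, scaled to ~35% opacity, as the compositing alpha.
    alpha = mask.convert("L").resize(overlay.size).point(lambda v: v * 35 // 100)
    overlay = Image.composite(tint, overlay, alpha)
overlay.save("image_with_masks.jpg")
```
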
## Citation

If you find our work helpful, please consider citing our research:

```
@article{zhang2025eligen,
  title={EliGen: Entity-Level Controlled Image Generation with Regional Attention},
  author={Zhang, Hong and Duan, Zhongjie and Wang, Xingjun and Chen, Yingda and Zhang, Yu},
  journal={arXiv preprint arXiv:2501.01097},
  year={2025}
}
```

README_from_modelscope.md
ADDED
@@ -0,0 +1,107 @@
---
frameworks:
- Pytorch
license: Apache License 2.0
tasks:
- text-to-image-synthesis

#model-type:
## e.g. gpt, phi, llama, chatglm, baichuan, etc.
#- gpt

#domain:
## e.g. nlp, cv, audio, multi-modal
#- nlp

#language:
## list of language codes: https://help.aliyun.com/document_detail/215387.html?spm=a2c4g.11186623.0.0.9f8d7467kni6Aa
#- cn

#metrics:
## e.g. CIDEr, BLEU, ROUGE, etc.
#- CIDEr

#tags:
## custom tags, including training methods such as pretrained, fine-tuned, instruction-tuned, RL-tuned, and others
#- pretrained

#tools:
## e.g. vllm, fastchat, llamacpp, AdaSeq, etc.
#- vllm
base_model:
- Qwen/Qwen-Image
base_model_relation: adapter
new_version: DiffSynth-Studio/Qwen-Image-EliGen-V2
---
# Qwen-Image Precise Region Control Model

![](./assets/title.png)

## Model Introduction

This model is a precise region control model trained on [Qwen-Image](https://www.modelscope.cn/models/Qwen/Qwen-Image), with a LoRA architecture. It controls the position and shape of each entity by taking as input the text and regional condition (mask map) of each entity. The training framework is built on [DiffSynth-Studio](https://github.com/modelscope/DiffSynth-Studio), and the training dataset is [DiffSynth-Studio/EliGenTrainSet](https://www.modelscope.cn/datasets/DiffSynth-Studio/EliGenTrainSet).

## Results Demonstration

|Entity Control Condition|Generated Image|
|-|-|
|![](./assets/samples/eligen_example_1_mask.png)|![](./assets/samples/eligen_example_1.png)|
|![](./assets/samples/eligen_example_2_mask.png)|![](./assets/samples/eligen_example_2.png)|
|![](./assets/samples/eligen_example_3_mask.png)|![](./assets/samples/eligen_example_3.png)|
|![](./assets/samples/eligen_example_4_mask.png)|![](./assets/samples/eligen_example_4.png)|
|![](./assets/samples/eligen_example_5_mask.png)|![](./assets/samples/eligen_example_5.png)|
|![](./assets/samples/poster_region.png)|![](./assets/samples/poster.png)|

## Inference Code

```
git clone https://github.com/modelscope/DiffSynth-Studio.git
cd DiffSynth-Studio
pip install -e .
```

```python
import torch
from PIL import Image
from diffsynth.pipelines.qwen_image import QwenImagePipeline, ModelConfig
from modelscope import dataset_snapshot_download, snapshot_download

# Load the base Qwen-Image components (DiT, text encoder, VAE) and tokenizer.
pipe = QwenImagePipeline.from_pretrained(
    torch_dtype=torch.bfloat16,
    device="cuda",
    model_configs=[
        ModelConfig(model_id="Qwen/Qwen-Image", origin_file_pattern="transformer/diffusion_pytorch_model*.safetensors"),
        ModelConfig(model_id="Qwen/Qwen-Image", origin_file_pattern="text_encoder/model*.safetensors"),
        ModelConfig(model_id="Qwen/Qwen-Image", origin_file_pattern="vae/diffusion_pytorch_model.safetensors"),
    ],
    tokenizer_config=ModelConfig(model_id="Qwen/Qwen-Image", origin_file_pattern="tokenizer/"),
)
# Download the EliGen LoRA weights and attach them to the DiT.
snapshot_download("DiffSynth-Studio/Qwen-Image-EliGen", local_dir="models/DiffSynth-Studio/Qwen-Image-EliGen", allow_file_pattern="model.safetensors")
pipe.load_lora(pipe.dit, "models/DiffSynth-Studio/Qwen-Image-EliGen/model.safetensors")

# The prompts are kept in Chinese here so the model renders the Chinese poster text.
global_prompt = "Qwen-Image-EliGen魔法咖啡厅的宣传海报,主体是两杯魔法咖啡,一杯冒着火焰,一杯冒着冰锥,背景是浅蓝色水雾,海报写着“Qwen-Image-EliGen魔法咖啡厅”、“新品上市”"
entity_prompts = ["一杯红色魔法咖啡,杯中火焰燃烧", "一杯红色魔法咖啡,杯中冰锥环绕", "字:“新品上市”", "字:“Qwen-Image-EliGen魔法咖啡厅”"]

# Download the example entity masks and resize them to the generation resolution.
dataset_snapshot_download(dataset_id="DiffSynth-Studio/examples_in_diffsynth", local_dir="./", allow_file_pattern="data/examples/eligen/qwen-image/example_6/*.png")
masks = [Image.open(f"./data/examples/eligen/qwen-image/example_6/{i}.png").convert("RGB").resize((1328, 1328)) for i in range(len(entity_prompts))]

image = pipe(
    prompt=global_prompt,
    seed=0,
    eligen_entity_prompts=entity_prompts,
    eligen_entity_masks=masks,
)
image.save("image.jpg")
```

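If your own mask images are grayscale or anti-aliased, they can be normalized to hard white-on-black RGB masks before being passed to the pipeline. A minimal helper sketch; the white-on-black convention is an assumption based on the bundled example masks, and `my_mask.png` is a hypothetical file name:

```python
from PIL import Image

def load_entity_mask(path, size=(1328, 1328), threshold=128):
    """Binarize an arbitrary mask image to white-on-black RGB at the target size."""
    mask = Image.open(path).convert("L").resize(size)
    mask = mask.point(lambda v: 255 if v >= threshold else 0)
    return mask.convert("RGB")

# Hypothetical file name, for illustration only.
mask = load_entity_mask("my_mask.png")
```
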
## Citation

If you find our work helpful, please consider citing our research:

```
@article{zhang2025eligen,
  title={EliGen: Entity-Level Controlled Image Generation with Regional Attention},
  author={Zhang, Hong and Duan, Zhongjie and Wang, Xingjun and Chen, Yingda and Zhang, Yu},
  journal={arXiv preprint arXiv:2501.01097},
  year={2025}
}
```

assets/samples/eligen_example_1.png
ADDED
assets/samples/eligen_example_1_mask.png
ADDED
assets/samples/eligen_example_2.png
ADDED
assets/samples/eligen_example_2_mask.png
ADDED
assets/samples/eligen_example_3.png
ADDED
assets/samples/eligen_example_3_mask.png
ADDED
assets/samples/eligen_example_4.png
ADDED
assets/samples/eligen_example_4_mask.png
ADDED
assets/samples/eligen_example_5.png
ADDED
assets/samples/eligen_example_5_mask.png
ADDED
assets/samples/poster.png
ADDED
assets/samples/poster_region.png
ADDED
assets/title.png
ADDED
configuration.json
ADDED
@@ -0,0 +1 @@
{"framework":"Pytorch","task":"text-to-image-synthesis"}
model.safetensors
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:8ac6571af480d75fc344fe8faf4b8f655e8c7809c473aaea439af6f6c8957a3b
size 472047184