Allex21 committed
Commit c11c83e · verified · 1 Parent(s): eac965b

Upload 89 files

.gitattributes CHANGED
@@ -33,3 +33,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
  *.zip filter=lfs diff=lfs merge=lfs -text
  *.zst filter=lfs diff=lfs merge=lfs -text
  *tfevents* filter=lfs diff=lfs merge=lfs -text
+ libbitsandbytes_cuda116.dll filter=lfs diff=lfs merge=lfs -text
+ libbitsandbytes_cuda118.dll filter=lfs diff=lfs merge=lfs -text
FUNDING.yml ADDED
@@ -0,0 +1,3 @@
+ # These are supported funding model platforms
+
+ github: kohya-ss
__init__.py ADDED
File without changes
adafactor_fused.py ADDED
@@ -0,0 +1,106 @@
1
+ import math
2
+ import torch
3
+ from transformers import Adafactor
4
+
5
+ @torch.no_grad()
6
+ def adafactor_step_param(self, p, group):
7
+ if p.grad is None:
8
+ return
9
+ grad = p.grad
10
+ if grad.dtype in {torch.float16, torch.bfloat16}:
11
+ grad = grad.float()
12
+ if grad.is_sparse:
13
+ raise RuntimeError("Adafactor does not support sparse gradients.")
14
+
15
+ state = self.state[p]
16
+ grad_shape = grad.shape
17
+
18
+ factored, use_first_moment = Adafactor._get_options(group, grad_shape)
19
+ # State Initialization
20
+ if len(state) == 0:
21
+ state["step"] = 0
22
+
23
+ if use_first_moment:
24
+ # Exponential moving average of gradient values
25
+ state["exp_avg"] = torch.zeros_like(grad)
26
+ if factored:
27
+ state["exp_avg_sq_row"] = torch.zeros(grad_shape[:-1]).to(grad)
28
+ state["exp_avg_sq_col"] = torch.zeros(grad_shape[:-2] + grad_shape[-1:]).to(grad)
29
+ else:
30
+ state["exp_avg_sq"] = torch.zeros_like(grad)
31
+
32
+ state["RMS"] = 0
33
+ else:
34
+ if use_first_moment:
35
+ state["exp_avg"] = state["exp_avg"].to(grad)
36
+ if factored:
37
+ state["exp_avg_sq_row"] = state["exp_avg_sq_row"].to(grad)
38
+ state["exp_avg_sq_col"] = state["exp_avg_sq_col"].to(grad)
39
+ else:
40
+ state["exp_avg_sq"] = state["exp_avg_sq"].to(grad)
41
+
42
+ p_data_fp32 = p
43
+ if p.dtype in {torch.float16, torch.bfloat16}:
44
+ p_data_fp32 = p_data_fp32.float()
45
+
46
+ state["step"] += 1
47
+ state["RMS"] = Adafactor._rms(p_data_fp32)
48
+ lr = Adafactor._get_lr(group, state)
49
+
50
+ beta2t = 1.0 - math.pow(state["step"], group["decay_rate"])
51
+ update = (grad ** 2) + group["eps"][0]
52
+ if factored:
53
+ exp_avg_sq_row = state["exp_avg_sq_row"]
54
+ exp_avg_sq_col = state["exp_avg_sq_col"]
55
+
56
+ exp_avg_sq_row.mul_(beta2t).add_(update.mean(dim=-1), alpha=(1.0 - beta2t))
57
+ exp_avg_sq_col.mul_(beta2t).add_(update.mean(dim=-2), alpha=(1.0 - beta2t))
58
+
59
+ # Approximation of exponential moving average of square of gradient
60
+ update = Adafactor._approx_sq_grad(exp_avg_sq_row, exp_avg_sq_col)
61
+ update.mul_(grad)
62
+ else:
63
+ exp_avg_sq = state["exp_avg_sq"]
64
+
65
+ exp_avg_sq.mul_(beta2t).add_(update, alpha=(1.0 - beta2t))
66
+ update = exp_avg_sq.rsqrt().mul_(grad)
67
+
68
+ update.div_((Adafactor._rms(update) / group["clip_threshold"]).clamp_(min=1.0))
69
+ update.mul_(lr)
70
+
71
+ if use_first_moment:
72
+ exp_avg = state["exp_avg"]
73
+ exp_avg.mul_(group["beta1"]).add_(update, alpha=(1 - group["beta1"]))
74
+ update = exp_avg
75
+
76
+ if group["weight_decay"] != 0:
77
+ p_data_fp32.add_(p_data_fp32, alpha=(-group["weight_decay"] * lr))
78
+
79
+ p_data_fp32.add_(-update)
80
+
81
+ if p.dtype in {torch.float16, torch.bfloat16}:
82
+ p.copy_(p_data_fp32)
83
+
84
+
85
+ @torch.no_grad()
86
+ def adafactor_step(self, closure=None):
87
+ """
88
+ Performs a single optimization step
89
+
90
+ Arguments:
91
+ closure (callable, optional): A closure that reevaluates the model
92
+ and returns the loss.
93
+ """
94
+ loss = None
95
+ if closure is not None:
96
+ loss = closure()
97
+
98
+ for group in self.param_groups:
99
+ for p in group["params"]:
100
+ adafactor_step_param(self, p, group)
101
+
102
+ return loss
103
+
104
+ def patch_adafactor_fused(optimizer: Adafactor):
105
+ optimizer.step_param = adafactor_step_param.__get__(optimizer)
106
+ optimizer.step = adafactor_step.__get__(optimizer)
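
A minimal usage sketch of the fused stepping pattern this file enables (illustrative only, not part of the commit; the import path and hyperparameters are assumptions):

```python
# Hypothetical sketch: patch an existing transformers Adafactor instance so the
# update can be applied one parameter at a time (e.g. from a fused backward hook).
import torch
from transformers import Adafactor
from adafactor_fused import patch_adafactor_fused  # assumed import path

model = torch.nn.Linear(8, 8)
optimizer = Adafactor(model.parameters(), lr=1e-3, relative_step=False, scale_parameter=False)
patch_adafactor_fused(optimizer)  # binds step_param()/step() onto this instance

loss = model(torch.randn(4, 8)).sum()
loss.backward()
for group in optimizer.param_groups:
    for p in group["params"]:
        optimizer.step_param(p, group)  # apply the Adafactor update for this parameter only
optimizer.zero_grad()
```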
attention.py ADDED
@@ -0,0 +1,119 @@
1
+ import os
2
+ import torch
3
+ from functools import cache, wraps
4
+
5
+ # pylint: disable=protected-access, missing-function-docstring, line-too-long
6
+
7
+ # ARC GPUs can't allocate more than 4GB to a single block so we slice the attention layers
8
+
9
+ sdpa_slice_trigger_rate = float(os.environ.get('IPEX_SDPA_SLICE_TRIGGER_RATE', 1))
10
+ attention_slice_rate = float(os.environ.get('IPEX_ATTENTION_SLICE_RATE', 0.5))
11
+
12
+ # Find something divisible with the input_tokens
13
+ @cache
14
+ def find_split_size(original_size, slice_block_size, slice_rate=2):
15
+ split_size = original_size
16
+ while True:
17
+ if (split_size * slice_block_size) <= slice_rate and original_size % split_size == 0:
18
+ return split_size
19
+ split_size = split_size - 1
20
+ if split_size <= 1:
21
+ return 1
22
+ return split_size
23
+
24
+
25
+ # Find slice sizes for SDPA
26
+ @cache
27
+ def find_sdpa_slice_sizes(query_shape, key_shape, query_element_size, slice_rate=2, trigger_rate=3):
28
+ batch_size, attn_heads, query_len, _ = query_shape
29
+ _, _, key_len, _ = key_shape
30
+
31
+ slice_batch_size = attn_heads * (query_len * key_len) * query_element_size / 1024 / 1024 / 1024
32
+
33
+ split_batch_size = batch_size
34
+ split_head_size = attn_heads
35
+ split_query_size = query_len
36
+
37
+ do_batch_split = False
38
+ do_head_split = False
39
+ do_query_split = False
40
+
41
+ if batch_size * slice_batch_size >= trigger_rate:
42
+ do_batch_split = True
43
+ split_batch_size = find_split_size(batch_size, slice_batch_size, slice_rate=slice_rate)
44
+
45
+ if split_batch_size * slice_batch_size > slice_rate:
46
+ slice_head_size = split_batch_size * (query_len * key_len) * query_element_size / 1024 / 1024 / 1024
47
+ do_head_split = True
48
+ split_head_size = find_split_size(attn_heads, slice_head_size, slice_rate=slice_rate)
49
+
50
+ if split_head_size * slice_head_size > slice_rate:
51
+ slice_query_size = split_batch_size * split_head_size * (key_len) * query_element_size / 1024 / 1024 / 1024
52
+ do_query_split = True
53
+ split_query_size = find_split_size(query_len, slice_query_size, slice_rate=slice_rate)
54
+
55
+ return do_batch_split, do_head_split, do_query_split, split_batch_size, split_head_size, split_query_size
56
+
57
+
58
+ original_scaled_dot_product_attention = torch.nn.functional.scaled_dot_product_attention
59
+ @wraps(torch.nn.functional.scaled_dot_product_attention)
60
+ def dynamic_scaled_dot_product_attention(query, key, value, attn_mask=None, dropout_p=0.0, is_causal=False, **kwargs):
61
+ if query.device.type != "xpu":
62
+ return original_scaled_dot_product_attention(query, key, value, attn_mask=attn_mask, dropout_p=dropout_p, is_causal=is_causal, **kwargs)
63
+ is_unsqueezed = False
64
+ if len(query.shape) == 3:
65
+ query = query.unsqueeze(0)
66
+ is_unsqueezed = True
67
+ if len(key.shape) == 3:
68
+ key = key.unsqueeze(0)
69
+ if len(value.shape) == 3:
70
+ value = value.unsqueeze(0)
71
+ do_batch_split, do_head_split, do_query_split, split_batch_size, split_head_size, split_query_size = find_sdpa_slice_sizes(query.shape, key.shape, query.element_size(), slice_rate=attention_slice_rate, trigger_rate=sdpa_slice_trigger_rate)
72
+
73
+ # Slice SDPA
74
+ if do_batch_split:
75
+ batch_size, attn_heads, query_len, _ = query.shape
76
+ _, _, _, head_dim = value.shape
77
+ hidden_states = torch.zeros((batch_size, attn_heads, query_len, head_dim), device=query.device, dtype=query.dtype)
78
+ if attn_mask is not None:
79
+ attn_mask = attn_mask.expand((query.shape[0], query.shape[1], query.shape[2], key.shape[-2]))
80
+ for ib in range(batch_size // split_batch_size):
81
+ start_idx = ib * split_batch_size
82
+ end_idx = (ib + 1) * split_batch_size
83
+ if do_head_split:
84
+ for ih in range(attn_heads // split_head_size): # pylint: disable=invalid-name
85
+ start_idx_h = ih * split_head_size
86
+ end_idx_h = (ih + 1) * split_head_size
87
+ if do_query_split:
88
+ for iq in range(query_len // split_query_size): # pylint: disable=invalid-name
89
+ start_idx_q = iq * split_query_size
90
+ end_idx_q = (iq + 1) * split_query_size
91
+ hidden_states[start_idx:end_idx, start_idx_h:end_idx_h, start_idx_q:end_idx_q, :] = original_scaled_dot_product_attention(
92
+ query[start_idx:end_idx, start_idx_h:end_idx_h, start_idx_q:end_idx_q, :],
93
+ key[start_idx:end_idx, start_idx_h:end_idx_h, :, :],
94
+ value[start_idx:end_idx, start_idx_h:end_idx_h, :, :],
95
+ attn_mask=attn_mask[start_idx:end_idx, start_idx_h:end_idx_h, start_idx_q:end_idx_q, :] if attn_mask is not None else attn_mask,
96
+ dropout_p=dropout_p, is_causal=is_causal, **kwargs
97
+ )
98
+ else:
99
+ hidden_states[start_idx:end_idx, start_idx_h:end_idx_h, :, :] = original_scaled_dot_product_attention(
100
+ query[start_idx:end_idx, start_idx_h:end_idx_h, :, :],
101
+ key[start_idx:end_idx, start_idx_h:end_idx_h, :, :],
102
+ value[start_idx:end_idx, start_idx_h:end_idx_h, :, :],
103
+ attn_mask=attn_mask[start_idx:end_idx, start_idx_h:end_idx_h, :, :] if attn_mask is not None else attn_mask,
104
+ dropout_p=dropout_p, is_causal=is_causal, **kwargs
105
+ )
106
+ else:
107
+ hidden_states[start_idx:end_idx, :, :, :] = original_scaled_dot_product_attention(
108
+ query[start_idx:end_idx, :, :, :],
109
+ key[start_idx:end_idx, :, :, :],
110
+ value[start_idx:end_idx, :, :, :],
111
+ attn_mask=attn_mask[start_idx:end_idx, :, :, :] if attn_mask is not None else attn_mask,
112
+ dropout_p=dropout_p, is_causal=is_causal, **kwargs
113
+ )
114
+ torch.xpu.synchronize(query.device)
115
+ else:
116
+ hidden_states = original_scaled_dot_product_attention(query, key, value, attn_mask=attn_mask, dropout_p=dropout_p, is_causal=is_causal, **kwargs)
117
+ if is_unsqueezed:
118
+ hidden_states = hidden_states.squeeze(0)
119
+ return hidden_states
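
A hedged sketch of how this wrapper is typically applied (illustrative, not part of the commit; the import path is an assumption): torch's SDPA entry point is replaced so XPU tensors go through the sliced path while other devices fall through unchanged.

```python
# Illustrative monkey-patch; on non-XPU devices the wrapper simply defers to
# the original torch.nn.functional.scaled_dot_product_attention.
import torch
from attention import dynamic_scaled_dot_product_attention  # assumed import path

torch.nn.functional.scaled_dot_product_attention = dynamic_scaled_dot_product_attention

q = torch.randn(1, 8, 64, 40)
k = torch.randn(1, 8, 64, 40)
v = torch.randn(1, 8, 64, 40)
out = torch.nn.functional.scaled_dot_product_attention(q, k, v)
print(out.shape)  # torch.Size([1, 8, 64, 40])
```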
attention_processors.py ADDED
@@ -0,0 +1,227 @@
1
+ import math
2
+ from typing import Any
3
+ from einops import rearrange
4
+ import torch
5
+ from diffusers.models.attention_processor import Attention
6
+
7
+
8
+ # flash attention forwards and backwards
9
+
10
+ # https://arxiv.org/abs/2205.14135
11
+
12
+ EPSILON = 1e-6
13
+
14
+
15
+ class FlashAttentionFunction(torch.autograd.function.Function):
16
+ @staticmethod
17
+ @torch.no_grad()
18
+ def forward(ctx, q, k, v, mask, causal, q_bucket_size, k_bucket_size):
19
+ """Algorithm 2 in the paper"""
20
+
21
+ device = q.device
22
+ dtype = q.dtype
23
+ max_neg_value = -torch.finfo(q.dtype).max
24
+ qk_len_diff = max(k.shape[-2] - q.shape[-2], 0)
25
+
26
+ o = torch.zeros_like(q)
27
+ all_row_sums = torch.zeros((*q.shape[:-1], 1), dtype=dtype, device=device)
28
+ all_row_maxes = torch.full(
29
+ (*q.shape[:-1], 1), max_neg_value, dtype=dtype, device=device
30
+ )
31
+
32
+ scale = q.shape[-1] ** -0.5
33
+
34
+ if mask is None:
35
+ mask = (None,) * math.ceil(q.shape[-2] / q_bucket_size)
36
+ else:
37
+ mask = rearrange(mask, "b n -> b 1 1 n")
38
+ mask = mask.split(q_bucket_size, dim=-1)
39
+
40
+ row_splits = zip(
41
+ q.split(q_bucket_size, dim=-2),
42
+ o.split(q_bucket_size, dim=-2),
43
+ mask,
44
+ all_row_sums.split(q_bucket_size, dim=-2),
45
+ all_row_maxes.split(q_bucket_size, dim=-2),
46
+ )
47
+
48
+ for ind, (qc, oc, row_mask, row_sums, row_maxes) in enumerate(row_splits):
49
+ q_start_index = ind * q_bucket_size - qk_len_diff
50
+
51
+ col_splits = zip(
52
+ k.split(k_bucket_size, dim=-2),
53
+ v.split(k_bucket_size, dim=-2),
54
+ )
55
+
56
+ for k_ind, (kc, vc) in enumerate(col_splits):
57
+ k_start_index = k_ind * k_bucket_size
58
+
59
+ attn_weights = (
60
+ torch.einsum("... i d, ... j d -> ... i j", qc, kc) * scale
61
+ )
62
+
63
+ if row_mask is not None:
64
+ attn_weights.masked_fill_(~row_mask, max_neg_value)
65
+
66
+ if causal and q_start_index < (k_start_index + k_bucket_size - 1):
67
+ causal_mask = torch.ones(
68
+ (qc.shape[-2], kc.shape[-2]), dtype=torch.bool, device=device
69
+ ).triu(q_start_index - k_start_index + 1)
70
+ attn_weights.masked_fill_(causal_mask, max_neg_value)
71
+
72
+ block_row_maxes = attn_weights.amax(dim=-1, keepdims=True)
73
+ attn_weights -= block_row_maxes
74
+ exp_weights = torch.exp(attn_weights)
75
+
76
+ if row_mask is not None:
77
+ exp_weights.masked_fill_(~row_mask, 0.0)
78
+
79
+ block_row_sums = exp_weights.sum(dim=-1, keepdims=True).clamp(
80
+ min=EPSILON
81
+ )
82
+
83
+ new_row_maxes = torch.maximum(block_row_maxes, row_maxes)
84
+
85
+ exp_values = torch.einsum(
86
+ "... i j, ... j d -> ... i d", exp_weights, vc
87
+ )
88
+
89
+ exp_row_max_diff = torch.exp(row_maxes - new_row_maxes)
90
+ exp_block_row_max_diff = torch.exp(block_row_maxes - new_row_maxes)
91
+
92
+ new_row_sums = (
93
+ exp_row_max_diff * row_sums
94
+ + exp_block_row_max_diff * block_row_sums
95
+ )
96
+
97
+ oc.mul_((row_sums / new_row_sums) * exp_row_max_diff).add_(
98
+ (exp_block_row_max_diff / new_row_sums) * exp_values
99
+ )
100
+
101
+ row_maxes.copy_(new_row_maxes)
102
+ row_sums.copy_(new_row_sums)
103
+
104
+ ctx.args = (causal, scale, mask, q_bucket_size, k_bucket_size)
105
+ ctx.save_for_backward(q, k, v, o, all_row_sums, all_row_maxes)
106
+
107
+ return o
108
+
109
+ @staticmethod
110
+ @torch.no_grad()
111
+ def backward(ctx, do):
112
+ """Algorithm 4 in the paper"""
113
+
114
+ causal, scale, mask, q_bucket_size, k_bucket_size = ctx.args
115
+ q, k, v, o, l, m = ctx.saved_tensors
116
+
117
+ device = q.device
118
+
119
+ max_neg_value = -torch.finfo(q.dtype).max
120
+ qk_len_diff = max(k.shape[-2] - q.shape[-2], 0)
121
+
122
+ dq = torch.zeros_like(q)
123
+ dk = torch.zeros_like(k)
124
+ dv = torch.zeros_like(v)
125
+
126
+ row_splits = zip(
127
+ q.split(q_bucket_size, dim=-2),
128
+ o.split(q_bucket_size, dim=-2),
129
+ do.split(q_bucket_size, dim=-2),
130
+ mask,
131
+ l.split(q_bucket_size, dim=-2),
132
+ m.split(q_bucket_size, dim=-2),
133
+ dq.split(q_bucket_size, dim=-2),
134
+ )
135
+
136
+ for ind, (qc, oc, doc, row_mask, lc, mc, dqc) in enumerate(row_splits):
137
+ q_start_index = ind * q_bucket_size - qk_len_diff
138
+
139
+ col_splits = zip(
140
+ k.split(k_bucket_size, dim=-2),
141
+ v.split(k_bucket_size, dim=-2),
142
+ dk.split(k_bucket_size, dim=-2),
143
+ dv.split(k_bucket_size, dim=-2),
144
+ )
145
+
146
+ for k_ind, (kc, vc, dkc, dvc) in enumerate(col_splits):
147
+ k_start_index = k_ind * k_bucket_size
148
+
149
+ attn_weights = (
150
+ torch.einsum("... i d, ... j d -> ... i j", qc, kc) * scale
151
+ )
152
+
153
+ if causal and q_start_index < (k_start_index + k_bucket_size - 1):
154
+ causal_mask = torch.ones(
155
+ (qc.shape[-2], kc.shape[-2]), dtype=torch.bool, device=device
156
+ ).triu(q_start_index - k_start_index + 1)
157
+ attn_weights.masked_fill_(causal_mask, max_neg_value)
158
+
159
+ exp_attn_weights = torch.exp(attn_weights - mc)
160
+
161
+ if row_mask is not None:
162
+ exp_attn_weights.masked_fill_(~row_mask, 0.0)
163
+
164
+ p = exp_attn_weights / lc
165
+
166
+ dv_chunk = torch.einsum("... i j, ... i d -> ... j d", p, doc)
167
+ dp = torch.einsum("... i d, ... j d -> ... i j", doc, vc)
168
+
169
+ D = (doc * oc).sum(dim=-1, keepdims=True)
170
+ ds = p * scale * (dp - D)
171
+
172
+ dq_chunk = torch.einsum("... i j, ... j d -> ... i d", ds, kc)
173
+ dk_chunk = torch.einsum("... i j, ... i d -> ... j d", ds, qc)
174
+
175
+ dqc.add_(dq_chunk)
176
+ dkc.add_(dk_chunk)
177
+ dvc.add_(dv_chunk)
178
+
179
+ return dq, dk, dv, None, None, None, None
180
+
181
+
182
+ class FlashAttnProcessor:
183
+ def __call__(
184
+ self,
185
+ attn: Attention,
186
+ hidden_states,
187
+ encoder_hidden_states=None,
188
+ attention_mask=None,
189
+ ) -> Any:
190
+ q_bucket_size = 512
191
+ k_bucket_size = 1024
192
+
193
+ h = attn.heads
194
+ q = attn.to_q(hidden_states)
195
+
196
+ encoder_hidden_states = (
197
+ encoder_hidden_states
198
+ if encoder_hidden_states is not None
199
+ else hidden_states
200
+ )
201
+ encoder_hidden_states = encoder_hidden_states.to(hidden_states.dtype)
202
+
203
+ if hasattr(attn, "hypernetwork") and attn.hypernetwork is not None:
204
+ context_k, context_v = attn.hypernetwork.forward(
205
+ hidden_states, encoder_hidden_states
206
+ )
207
+ context_k = context_k.to(hidden_states.dtype)
208
+ context_v = context_v.to(hidden_states.dtype)
209
+ else:
210
+ context_k = encoder_hidden_states
211
+ context_v = encoder_hidden_states
212
+
213
+ k = attn.to_k(context_k)
214
+ v = attn.to_v(context_v)
215
+ del encoder_hidden_states, hidden_states
216
+
217
+ q, k, v = map(lambda t: rearrange(t, "b n (h d) -> b h n d", h=h), (q, k, v))
218
+
219
+ out = FlashAttentionFunction.apply(
220
+ q, k, v, attention_mask, False, q_bucket_size, k_bucket_size
221
+ )
222
+
223
+ out = rearrange(out, "b h n d -> b n (h d)")
224
+
225
+ out = attn.to_out[0](out)
226
+ out = attn.to_out[1](out)
227
+ return out
blip.py ADDED
@@ -0,0 +1,245 @@
1
+ '''
2
+ * Copyright (c) 2022, salesforce.com, inc.
3
+ * All rights reserved.
4
+ * SPDX-License-Identifier: BSD-3-Clause
5
+ * For full license text, see LICENSE.txt file in the repo root or https://opensource.org/licenses/BSD-3-Clause
6
+ * By Junnan Li
7
+ '''
8
+ import warnings
9
+ warnings.filterwarnings("ignore")
10
+
11
+ # from models.vit import VisionTransformer, interpolate_pos_embed
12
+ # from models.med import BertConfig, BertModel, BertLMHeadModel
13
+ from blip.vit import VisionTransformer, interpolate_pos_embed
14
+ from blip.med import BertConfig, BertModel, BertLMHeadModel
15
+ from transformers import BertTokenizer
16
+
17
+ import torch
18
+ from torch import nn
19
+ import torch.nn.functional as F
20
+
21
+ import os
22
+ from urllib.parse import urlparse
23
+ from timm.models.hub import download_cached_file
24
+ from library.utils import setup_logging
25
+ setup_logging()
26
+ import logging
27
+ logger = logging.getLogger(__name__)
28
+
29
+ class BLIP_Base(nn.Module):
30
+ def __init__(self,
31
+ med_config = 'configs/med_config.json',
32
+ image_size = 224,
33
+ vit = 'base',
34
+ vit_grad_ckpt = False,
35
+ vit_ckpt_layer = 0,
36
+ ):
37
+ """
38
+ Args:
39
+ med_config (str): path for the mixture of encoder-decoder model's configuration file
40
+ image_size (int): input image size
41
+ vit (str): model size of vision transformer
42
+ """
43
+ super().__init__()
44
+
45
+ self.visual_encoder, vision_width = create_vit(vit,image_size, vit_grad_ckpt, vit_ckpt_layer)
46
+ self.tokenizer = init_tokenizer()
47
+ med_config = BertConfig.from_json_file(med_config)
48
+ med_config.encoder_width = vision_width
49
+ self.text_encoder = BertModel(config=med_config, add_pooling_layer=False)
50
+
51
+
52
+ def forward(self, image, caption, mode):
53
+
54
+ assert mode in ['image', 'text', 'multimodal'], "mode parameter must be image, text, or multimodal"
55
+ text = self.tokenizer(caption, return_tensors="pt").to(image.device)
56
+
57
+ if mode=='image':
58
+ # return image features
59
+ image_embeds = self.visual_encoder(image)
60
+ return image_embeds
61
+
62
+ elif mode=='text':
63
+ # return text features
64
+ text_output = self.text_encoder(text.input_ids, attention_mask = text.attention_mask,
65
+ return_dict = True, mode = 'text')
66
+ return text_output.last_hidden_state
67
+
68
+ elif mode=='multimodal':
69
+ # return multimodel features
70
+ image_embeds = self.visual_encoder(image)
71
+ image_atts = torch.ones(image_embeds.size()[:-1],dtype=torch.long).to(image.device)
72
+
73
+ text.input_ids[:,0] = self.tokenizer.enc_token_id
74
+ output = self.text_encoder(text.input_ids,
75
+ attention_mask = text.attention_mask,
76
+ encoder_hidden_states = image_embeds,
77
+ encoder_attention_mask = image_atts,
78
+ return_dict = True,
79
+ )
80
+ return output.last_hidden_state
81
+
82
+
83
+
84
+ class BLIP_Decoder(nn.Module):
85
+ def __init__(self,
86
+ med_config = 'configs/med_config.json',
87
+ image_size = 384,
88
+ vit = 'base',
89
+ vit_grad_ckpt = False,
90
+ vit_ckpt_layer = 0,
91
+ prompt = 'a picture of ',
92
+ ):
93
+ """
94
+ Args:
95
+ med_config (str): path for the mixture of encoder-decoder model's configuration file
96
+ image_size (int): input image size
97
+ vit (str): model size of vision transformer
98
+ """
99
+ super().__init__()
100
+
101
+ self.visual_encoder, vision_width = create_vit(vit,image_size, vit_grad_ckpt, vit_ckpt_layer)
102
+ self.tokenizer = init_tokenizer()
103
+ med_config = BertConfig.from_json_file(med_config)
104
+ med_config.encoder_width = vision_width
105
+ self.text_decoder = BertLMHeadModel(config=med_config)
106
+
107
+ self.prompt = prompt
108
+ self.prompt_length = len(self.tokenizer(self.prompt).input_ids)-1
109
+
110
+
111
+ def forward(self, image, caption):
112
+
113
+ image_embeds = self.visual_encoder(image)
114
+ image_atts = torch.ones(image_embeds.size()[:-1],dtype=torch.long).to(image.device)
115
+
116
+ text = self.tokenizer(caption, padding='longest', truncation=True, max_length=40, return_tensors="pt").to(image.device)
117
+
118
+ text.input_ids[:,0] = self.tokenizer.bos_token_id
119
+
120
+ decoder_targets = text.input_ids.masked_fill(text.input_ids == self.tokenizer.pad_token_id, -100)
121
+ decoder_targets[:,:self.prompt_length] = -100
122
+
123
+ decoder_output = self.text_decoder(text.input_ids,
124
+ attention_mask = text.attention_mask,
125
+ encoder_hidden_states = image_embeds,
126
+ encoder_attention_mask = image_atts,
127
+ labels = decoder_targets,
128
+ return_dict = True,
129
+ )
130
+ loss_lm = decoder_output.loss
131
+
132
+ return loss_lm
133
+
134
+ def generate(self, image, sample=False, num_beams=3, max_length=30, min_length=10, top_p=0.9, repetition_penalty=1.0):
135
+ image_embeds = self.visual_encoder(image)
136
+
137
+ # recent version of transformers seems to do repeat_interleave automatically
138
+ # if not sample:
139
+ # image_embeds = image_embeds.repeat_interleave(num_beams,dim=0)
140
+
141
+ image_atts = torch.ones(image_embeds.size()[:-1],dtype=torch.long).to(image.device)
142
+ model_kwargs = {"encoder_hidden_states": image_embeds, "encoder_attention_mask":image_atts}
143
+
144
+ prompt = [self.prompt] * image.size(0)
145
+ input_ids = self.tokenizer(prompt, return_tensors="pt").input_ids.to(image.device)
146
+ input_ids[:,0] = self.tokenizer.bos_token_id
147
+ input_ids = input_ids[:, :-1]
148
+
149
+ if sample:
150
+ #nucleus sampling
151
+ outputs = self.text_decoder.generate(input_ids=input_ids,
152
+ max_length=max_length,
153
+ min_length=min_length,
154
+ do_sample=True,
155
+ top_p=top_p,
156
+ num_return_sequences=1,
157
+ eos_token_id=self.tokenizer.sep_token_id,
158
+ pad_token_id=self.tokenizer.pad_token_id,
159
+ repetition_penalty=1.1,
160
+ **model_kwargs)
161
+ else:
162
+ #beam search
163
+ outputs = self.text_decoder.generate(input_ids=input_ids,
164
+ max_length=max_length,
165
+ min_length=min_length,
166
+ num_beams=num_beams,
167
+ eos_token_id=self.tokenizer.sep_token_id,
168
+ pad_token_id=self.tokenizer.pad_token_id,
169
+ repetition_penalty=repetition_penalty,
170
+ **model_kwargs)
171
+
172
+ captions = []
173
+ for output in outputs:
174
+ caption = self.tokenizer.decode(output, skip_special_tokens=True)
175
+ captions.append(caption[len(self.prompt):])
176
+ return captions
177
+
178
+
179
+ def blip_decoder(pretrained='',**kwargs):
180
+ model = BLIP_Decoder(**kwargs)
181
+ if pretrained:
182
+ model,msg = load_checkpoint(model,pretrained)
183
+ assert(len(msg.missing_keys)==0)
184
+ return model
185
+
186
+ def blip_feature_extractor(pretrained='',**kwargs):
187
+ model = BLIP_Base(**kwargs)
188
+ if pretrained:
189
+ model,msg = load_checkpoint(model,pretrained)
190
+ assert(len(msg.missing_keys)==0)
191
+ return model
192
+
193
+ def init_tokenizer():
194
+ tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
195
+ tokenizer.add_special_tokens({'bos_token':'[DEC]'})
196
+ tokenizer.add_special_tokens({'additional_special_tokens':['[ENC]']})
197
+ tokenizer.enc_token_id = tokenizer.additional_special_tokens_ids[0]
198
+ return tokenizer
199
+
200
+
201
+ def create_vit(vit, image_size, use_grad_checkpointing=False, ckpt_layer=0, drop_path_rate=0):
202
+
203
+ assert vit in ['base', 'large'], "vit parameter must be base or large"
204
+ if vit=='base':
205
+ vision_width = 768
206
+ visual_encoder = VisionTransformer(img_size=image_size, patch_size=16, embed_dim=vision_width, depth=12,
207
+ num_heads=12, use_grad_checkpointing=use_grad_checkpointing, ckpt_layer=ckpt_layer,
208
+ drop_path_rate=0 or drop_path_rate
209
+ )
210
+ elif vit=='large':
211
+ vision_width = 1024
212
+ visual_encoder = VisionTransformer(img_size=image_size, patch_size=16, embed_dim=vision_width, depth=24,
213
+ num_heads=16, use_grad_checkpointing=use_grad_checkpointing, ckpt_layer=ckpt_layer,
214
+ drop_path_rate=0.1 or drop_path_rate
215
+ )
216
+ return visual_encoder, vision_width
217
+
218
+ def is_url(url_or_filename):
219
+ parsed = urlparse(url_or_filename)
220
+ return parsed.scheme in ("http", "https")
221
+
222
+ def load_checkpoint(model,url_or_filename):
223
+ if is_url(url_or_filename):
224
+ cached_file = download_cached_file(url_or_filename, check_hash=False, progress=True)
225
+ checkpoint = torch.load(cached_file, map_location='cpu')
226
+ elif os.path.isfile(url_or_filename):
227
+ checkpoint = torch.load(url_or_filename, map_location='cpu')
228
+ else:
229
+ raise RuntimeError('checkpoint url or path is invalid')
230
+
231
+ state_dict = checkpoint['model']
232
+
233
+ state_dict['visual_encoder.pos_embed'] = interpolate_pos_embed(state_dict['visual_encoder.pos_embed'],model.visual_encoder)
234
+ if 'visual_encoder_m.pos_embed' in model.state_dict().keys():
235
+ state_dict['visual_encoder_m.pos_embed'] = interpolate_pos_embed(state_dict['visual_encoder_m.pos_embed'],
236
+ model.visual_encoder_m)
237
+ for key in model.state_dict().keys():
238
+ if key in state_dict.keys():
239
+ if state_dict[key].shape!=model.state_dict()[key].shape:
240
+ del state_dict[key]
241
+
242
+ msg = model.load_state_dict(state_dict,strict=False)
243
+ logger.info('load checkpoint from %s'%url_or_filename)
244
+ return model,msg
245
+
cache_latents.py ADDED
@@ -0,0 +1,205 @@
1
+ # latentsのdiskへの事前キャッシュを行う / cache latents to disk
2
+
3
+ import argparse
4
+ import math
5
+ from multiprocessing import Value
6
+ import os
7
+
8
+ from accelerate.utils import set_seed
9
+ import torch
10
+ from tqdm import tqdm
11
+
12
+ from library import config_util
13
+ from library import train_util
14
+ from library import sdxl_train_util
15
+ from library.config_util import (
16
+ ConfigSanitizer,
17
+ BlueprintGenerator,
18
+ )
19
+ from library.utils import setup_logging, add_logging_arguments
20
+ setup_logging()
21
+ import logging
22
+
23
+ logger = logging.getLogger(__name__)
24
+
25
+
26
+ def cache_to_disk(args: argparse.Namespace) -> None:
27
+ setup_logging(args, reset=True)
28
+ train_util.prepare_dataset_args(args, True)
29
+
30
+ # check cache latents arg
31
+ assert args.cache_latents_to_disk, "cache_latents_to_disk must be True / cache_latents_to_diskはTrueである必要があります"
32
+
33
+ use_dreambooth_method = args.in_json is None
34
+
35
+ if args.seed is not None:
36
+ set_seed(args.seed) # 乱数系列を初期化する
37
+
38
+ # tokenizerを準備する:datasetを動かすために必要
39
+ if args.sdxl:
40
+ tokenizer1, tokenizer2 = sdxl_train_util.load_tokenizers(args)
41
+ tokenizers = [tokenizer1, tokenizer2]
42
+ else:
43
+ tokenizer = train_util.load_tokenizer(args)
44
+ tokenizers = [tokenizer]
45
+
46
+ # データセットを準備する
47
+ if args.dataset_class is None:
48
+ blueprint_generator = BlueprintGenerator(ConfigSanitizer(True, True, False, True))
49
+ if args.dataset_config is not None:
50
+ logger.info(f"Load dataset config from {args.dataset_config}")
51
+ user_config = config_util.load_user_config(args.dataset_config)
52
+ ignored = ["train_data_dir", "in_json"]
53
+ if any(getattr(args, attr) is not None for attr in ignored):
54
+ logger.warning(
55
+ "ignore following options because config file is found: {0} / 設定ファイルが利用されるため以下のオプションは無視されます: {0}".format(
56
+ ", ".join(ignored)
57
+ )
58
+ )
59
+ else:
60
+ if use_dreambooth_method:
61
+ logger.info("Using DreamBooth method.")
62
+ user_config = {
63
+ "datasets": [
64
+ {
65
+ "subsets": config_util.generate_dreambooth_subsets_config_by_subdirs(
66
+ args.train_data_dir, args.reg_data_dir
67
+ )
68
+ }
69
+ ]
70
+ }
71
+ else:
72
+ logger.info("Training with captions.")
73
+ user_config = {
74
+ "datasets": [
75
+ {
76
+ "subsets": [
77
+ {
78
+ "image_dir": args.train_data_dir,
79
+ "metadata_file": args.in_json,
80
+ }
81
+ ]
82
+ }
83
+ ]
84
+ }
85
+
86
+ blueprint = blueprint_generator.generate(user_config, args, tokenizer=tokenizers)
87
+ train_dataset_group = config_util.generate_dataset_group_by_blueprint(blueprint.dataset_group)
88
+ else:
89
+ train_dataset_group = train_util.load_arbitrary_dataset(args, tokenizers)
90
+
91
+ # datasetのcache_latentsを呼ばなければ、生の画像が返る
92
+
93
+ current_epoch = Value("i", 0)
94
+ current_step = Value("i", 0)
95
+ ds_for_collator = train_dataset_group if args.max_data_loader_n_workers == 0 else None
96
+ collator = train_util.collator_class(current_epoch, current_step, ds_for_collator)
97
+
98
+ # acceleratorを準備する
99
+ logger.info("prepare accelerator")
100
+ args.deepspeed = False
101
+ accelerator = train_util.prepare_accelerator(args)
102
+
103
+ # mixed precisionに対応した型を用意しておき適宜castする
104
+ weight_dtype, _ = train_util.prepare_dtype(args)
105
+ vae_dtype = torch.float32 if args.no_half_vae else weight_dtype
106
+
107
+ # モデルを読み込む
108
+ logger.info("load model")
109
+ if args.sdxl:
110
+ (_, _, _, vae, _, _, _) = sdxl_train_util.load_target_model(args, accelerator, "sdxl", weight_dtype)
111
+ else:
112
+ _, vae, _, _ = train_util.load_target_model(args, weight_dtype, accelerator)
113
+
114
+ if torch.__version__ >= "2.0.0": # PyTorch 2.0.0 以上対応のxformersなら以下が使える
115
+ vae.set_use_memory_efficient_attention_xformers(args.xformers)
116
+ vae.to(accelerator.device, dtype=vae_dtype)
117
+ vae.requires_grad_(False)
118
+ vae.eval()
119
+
120
+ # dataloaderを準備する
121
+ train_dataset_group.set_caching_mode("latents")
122
+
123
+ # DataLoaderのプロセス数:0 は persistent_workers が使えないので注意
124
+ n_workers = min(args.max_data_loader_n_workers, os.cpu_count()) # cpu_count or max_data_loader_n_workers
125
+
126
+ train_dataloader = torch.utils.data.DataLoader(
127
+ train_dataset_group,
128
+ batch_size=1,
129
+ shuffle=True,
130
+ collate_fn=collator,
131
+ num_workers=n_workers,
132
+ persistent_workers=args.persistent_data_loader_workers,
133
+ )
134
+
135
+ # acceleratorを使ってモデルを準備する:マルチGPUで使えるようになるはず
136
+ train_dataloader = accelerator.prepare(train_dataloader)
137
+
138
+ # データ取得のためのループ
139
+ for batch in tqdm(train_dataloader):
140
+ b_size = len(batch["images"])
141
+ vae_batch_size = b_size if args.vae_batch_size is None else args.vae_batch_size
142
+ flip_aug = batch["flip_aug"]
143
+ alpha_mask = batch["alpha_mask"]
144
+ random_crop = batch["random_crop"]
145
+ bucket_reso = batch["bucket_reso"]
146
+
147
+ # バッチを分割して処理する
148
+ for i in range(0, b_size, vae_batch_size):
149
+ images = batch["images"][i : i + vae_batch_size]
150
+ absolute_paths = batch["absolute_paths"][i : i + vae_batch_size]
151
+ resized_sizes = batch["resized_sizes"][i : i + vae_batch_size]
152
+
153
+ image_infos = []
154
+ for i, (image, absolute_path, resized_size) in enumerate(zip(images, absolute_paths, resized_sizes)):
155
+ image_info = train_util.ImageInfo(absolute_path, 1, "dummy", False, absolute_path)
156
+ image_info.image = image
157
+ image_info.bucket_reso = bucket_reso
158
+ image_info.resized_size = resized_size
159
+ image_info.latents_npz = os.path.splitext(absolute_path)[0] + ".npz"
160
+
161
+ if args.skip_existing:
162
+ if train_util.is_disk_cached_latents_is_expected(
163
+ image_info.bucket_reso, image_info.latents_npz, flip_aug, alpha_mask
164
+ ):
165
+ logger.warning(f"Skipping {image_info.latents_npz} because it already exists.")
166
+ continue
167
+
168
+ image_infos.append(image_info)
169
+
170
+ if len(image_infos) > 0:
171
+ train_util.cache_batch_latents(vae, True, image_infos, flip_aug, alpha_mask, random_crop)
172
+
173
+ accelerator.wait_for_everyone()
174
+ accelerator.print(f"Finished caching latents for {len(train_dataset_group)} batches.")
175
+
176
+
177
+ def setup_parser() -> argparse.ArgumentParser:
178
+ parser = argparse.ArgumentParser()
179
+
180
+ add_logging_arguments(parser)
181
+ train_util.add_sd_models_arguments(parser)
182
+ train_util.add_training_arguments(parser, True)
183
+ train_util.add_dataset_arguments(parser, True, True, True)
184
+ config_util.add_config_arguments(parser)
185
+ parser.add_argument("--sdxl", action="store_true", help="Use SDXL model / SDXLモデルを使用する")
186
+ parser.add_argument(
187
+ "--no_half_vae",
188
+ action="store_true",
189
+ help="do not use fp16/bf16 VAE in mixed precision (use float VAE) / mixed precisionでも fp16/bf16 VAEを使わずfloat VAEを使う",
190
+ )
191
+ parser.add_argument(
192
+ "--skip_existing",
193
+ action="store_true",
194
+ help="skip images if npz already exists (both normal and flipped exists if flip_aug is enabled) / npzが既に存在する画像をスキップする(flip_aug有効時は通常、反転の両方が存在する画像をスキップ)",
195
+ )
196
+ return parser
197
+
198
+
199
+ if __name__ == "__main__":
200
+ parser = setup_parser()
201
+
202
+ args = parser.parse_args()
203
+ args = train_util.read_config_from_file(args, parser)
204
+
205
+ cache_to_disk(args)
cache_text_encoder_outputs.py ADDED
@@ -0,0 +1,197 @@
1
+ # text encoder出力のdiskへの事前キャッシュを行う / cache text encoder outputs to disk in advance
2
+
3
+ import argparse
4
+ import math
5
+ from multiprocessing import Value
6
+ import os
7
+
8
+ from accelerate.utils import set_seed
9
+ import torch
10
+ from tqdm import tqdm
11
+
12
+ from library import config_util
13
+ from library import train_util
14
+ from library import sdxl_train_util
15
+ from library.config_util import (
16
+ ConfigSanitizer,
17
+ BlueprintGenerator,
18
+ )
19
+ from library.utils import setup_logging, add_logging_arguments
20
+ setup_logging()
21
+ import logging
22
+ logger = logging.getLogger(__name__)
23
+
24
+ def cache_to_disk(args: argparse.Namespace) -> None:
25
+ setup_logging(args, reset=True)
26
+ train_util.prepare_dataset_args(args, True)
27
+
28
+ # check cache arg
29
+ assert (
30
+ args.cache_text_encoder_outputs_to_disk
31
+ ), "cache_text_encoder_outputs_to_disk must be True / cache_text_encoder_outputs_to_diskはTrueである必要があります"
32
+
33
+ # できるだけ準備はしておくが今のところSDXLのみしか動かない
34
+ assert (
35
+ args.sdxl
36
+ ), "cache_text_encoder_outputs_to_disk is only available for SDXL / cache_text_encoder_outputs_to_diskはSDXLのみ利用可能です"
37
+
38
+ use_dreambooth_method = args.in_json is None
39
+
40
+ if args.seed is not None:
41
+ set_seed(args.seed) # 乱数系列を初期化する
42
+
43
+ # tokenizerを準備する:datasetを動かすために必要
44
+ if args.sdxl:
45
+ tokenizer1, tokenizer2 = sdxl_train_util.load_tokenizers(args)
46
+ tokenizers = [tokenizer1, tokenizer2]
47
+ else:
48
+ tokenizer = train_util.load_tokenizer(args)
49
+ tokenizers = [tokenizer]
50
+
51
+ # データセットを準備する
52
+ if args.dataset_class is None:
53
+ blueprint_generator = BlueprintGenerator(ConfigSanitizer(True, True, False, True))
54
+ if args.dataset_config is not None:
55
+ logger.info(f"Load dataset config from {args.dataset_config}")
56
+ user_config = config_util.load_user_config(args.dataset_config)
57
+ ignored = ["train_data_dir", "in_json"]
58
+ if any(getattr(args, attr) is not None for attr in ignored):
59
+ logger.warning(
60
+ "ignore following options because config file is found: {0} / 設定ファイルが利用されるため以下のオプションは無視されます: {0}".format(
61
+ ", ".join(ignored)
62
+ )
63
+ )
64
+ else:
65
+ if use_dreambooth_method:
66
+ logger.info("Using DreamBooth method.")
67
+ user_config = {
68
+ "datasets": [
69
+ {
70
+ "subsets": config_util.generate_dreambooth_subsets_config_by_subdirs(
71
+ args.train_data_dir, args.reg_data_dir
72
+ )
73
+ }
74
+ ]
75
+ }
76
+ else:
77
+ logger.info("Training with captions.")
78
+ user_config = {
79
+ "datasets": [
80
+ {
81
+ "subsets": [
82
+ {
83
+ "image_dir": args.train_data_dir,
84
+ "metadata_file": args.in_json,
85
+ }
86
+ ]
87
+ }
88
+ ]
89
+ }
90
+
91
+ blueprint = blueprint_generator.generate(user_config, args, tokenizer=tokenizers)
92
+ train_dataset_group = config_util.generate_dataset_group_by_blueprint(blueprint.dataset_group)
93
+ else:
94
+ train_dataset_group = train_util.load_arbitrary_dataset(args, tokenizers)
95
+
96
+ current_epoch = Value("i", 0)
97
+ current_step = Value("i", 0)
98
+ ds_for_collator = train_dataset_group if args.max_data_loader_n_workers == 0 else None
99
+ collator = train_util.collator_class(current_epoch, current_step, ds_for_collator)
100
+
101
+ # acceleratorを準備する
102
+ logger.info("prepare accelerator")
103
+ args.deepspeed = False
104
+ accelerator = train_util.prepare_accelerator(args)
105
+
106
+ # mixed precisionに対応した型を用意しておき適宜castする
107
+ weight_dtype, _ = train_util.prepare_dtype(args)
108
+
109
+ # モデルを読み込む
110
+ logger.info("load model")
111
+ if args.sdxl:
112
+ (_, text_encoder1, text_encoder2, _, _, _, _) = sdxl_train_util.load_target_model(args, accelerator, "sdxl", weight_dtype)
113
+ text_encoders = [text_encoder1, text_encoder2]
114
+ else:
115
+ text_encoder1, _, _, _ = train_util.load_target_model(args, weight_dtype, accelerator)
116
+ text_encoders = [text_encoder1]
117
+
118
+ for text_encoder in text_encoders:
119
+ text_encoder.to(accelerator.device, dtype=weight_dtype)
120
+ text_encoder.requires_grad_(False)
121
+ text_encoder.eval()
122
+
123
+ # dataloaderを準備する
124
+ train_dataset_group.set_caching_mode("text")
125
+
126
+ # DataLoaderのプロセス数:0 は persistent_workers が使えないので注意
127
+ n_workers = min(args.max_data_loader_n_workers, os.cpu_count()) # cpu_count or max_data_loader_n_workers
128
+
129
+ train_dataloader = torch.utils.data.DataLoader(
130
+ train_dataset_group,
131
+ batch_size=1,
132
+ shuffle=True,
133
+ collate_fn=collator,
134
+ num_workers=n_workers,
135
+ persistent_workers=args.persistent_data_loader_workers,
136
+ )
137
+
138
+ # acceleratorを使ってモデルを準備する:マルチGPUで使えるようになるはず
139
+ train_dataloader = accelerator.prepare(train_dataloader)
140
+
141
+ # データ取得のためのループ
142
+ for batch in tqdm(train_dataloader):
143
+ absolute_paths = batch["absolute_paths"]
144
+ input_ids1_list = batch["input_ids1_list"]
145
+ input_ids2_list = batch["input_ids2_list"]
146
+
147
+ image_infos = []
148
+ for absolute_path, input_ids1, input_ids2 in zip(absolute_paths, input_ids1_list, input_ids2_list):
149
+ image_info = train_util.ImageInfo(absolute_path, 1, "dummy", False, absolute_path)
150
+ image_info.text_encoder_outputs_npz = os.path.splitext(absolute_path)[0] + train_util.TEXT_ENCODER_OUTPUTS_CACHE_SUFFIX
151
+ image_info
152
+
153
+ if args.skip_existing:
154
+ if os.path.exists(image_info.text_encoder_outputs_npz):
155
+ logger.warning(f"Skipping {image_info.text_encoder_outputs_npz} because it already exists.")
156
+ continue
157
+
158
+ image_info.input_ids1 = input_ids1
159
+ image_info.input_ids2 = input_ids2
160
+ image_infos.append(image_info)
161
+
162
+ if len(image_infos) > 0:
163
+ b_input_ids1 = torch.stack([image_info.input_ids1 for image_info in image_infos])
164
+ b_input_ids2 = torch.stack([image_info.input_ids2 for image_info in image_infos])
165
+ train_util.cache_batch_text_encoder_outputs(
166
+ image_infos, tokenizers, text_encoders, args.max_token_length, True, b_input_ids1, b_input_ids2, weight_dtype
167
+ )
168
+
169
+ accelerator.wait_for_everyone()
170
+ accelerator.print(f"Finished caching latents for {len(train_dataset_group)} batches.")
171
+
172
+
173
+ def setup_parser() -> argparse.ArgumentParser:
174
+ parser = argparse.ArgumentParser()
175
+
176
+ add_logging_arguments(parser)
177
+ train_util.add_sd_models_arguments(parser)
178
+ train_util.add_training_arguments(parser, True)
179
+ train_util.add_dataset_arguments(parser, True, True, True)
180
+ config_util.add_config_arguments(parser)
181
+ sdxl_train_util.add_sdxl_training_arguments(parser)
182
+ parser.add_argument("--sdxl", action="store_true", help="Use SDXL model / SDXLモデルを使用する")
183
+ parser.add_argument(
184
+ "--skip_existing",
185
+ action="store_true",
186
+ help="skip images if npz already exists (both normal and flipped exists if flip_aug is enabled) / npzが既に存在する画像をスキップする(flip_aug有効時は通常、反転の両方が存在する画像をスキップ)",
187
+ )
188
+ return parser
189
+
190
+
191
+ if __name__ == "__main__":
192
+ parser = setup_parser()
193
+
194
+ args = parser.parse_args()
195
+ args = train_util.read_config_from_file(args, parser)
196
+
197
+ cache_to_disk(args)
canny.py ADDED
@@ -0,0 +1,34 @@
+ import argparse
+ import cv2
+
+ import logging
+ from library.utils import setup_logging
+ setup_logging()
+ logger = logging.getLogger(__name__)
+
+ def canny(args):
+     img = cv2.imread(args.input)
+     img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
+
+     canny_img = cv2.Canny(img, args.thres1, args.thres2)
+     # canny_img = 255 - canny_img
+
+     cv2.imwrite(args.output, canny_img)
+     logger.info("done!")
+
+
+ def setup_parser() -> argparse.ArgumentParser:
+     parser = argparse.ArgumentParser()
+     parser.add_argument("--input", type=str, default=None, help="input path")
+     parser.add_argument("--output", type=str, default=None, help="output path")
+     parser.add_argument("--thres1", type=int, default=32, help="thres1")
+     parser.add_argument("--thres2", type=int, default=224, help="thres2")
+
+     return parser
+
+
+ if __name__ == '__main__':
+     parser = setup_parser()
+
+     args = parser.parse_args()
+     canny(args)
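
For reference, the script can also be driven programmatically; the file names below are placeholders (illustrative, not part of the commit):

```python
# Hypothetical invocation of canny.py without the command line.
from canny import canny, setup_parser

args = setup_parser().parse_args(
    ["--input", "lineart.png", "--output", "lineart_canny.png", "--thres1", "32", "--thres2", "224"]
)
canny(args)  # writes the Canny edge map of lineart.png to lineart_canny.png
```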
cextension.py ADDED
@@ -0,0 +1,54 @@
+ import ctypes as ct
+ from pathlib import Path
+ from warnings import warn
+
+ from .cuda_setup.main import evaluate_cuda_setup
+
+
+ class CUDALibrary_Singleton(object):
+     _instance = None
+
+     def __init__(self):
+         raise RuntimeError("Call get_instance() instead")
+
+     def initialize(self):
+         binary_name = evaluate_cuda_setup()
+         package_dir = Path(__file__).parent
+         binary_path = package_dir / binary_name
+
+         if not binary_path.exists():
+             print(f"CUDA SETUP: TODO: compile library for specific version: {binary_name}")
+             legacy_binary_name = "libbitsandbytes.so"
+             print(f"CUDA SETUP: Defaulting to {legacy_binary_name}...")
+             binary_path = package_dir / legacy_binary_name
+             if not binary_path.exists():
+                 print('CUDA SETUP: CUDA detection failed. Either CUDA driver not installed, CUDA not installed, or you have multiple conflicting CUDA libraries!')
+                 print('CUDA SETUP: If you compiled from source, try again with `make CUDA_VERSION=DETECTED_CUDA_VERSION` for example, `make CUDA_VERSION=113`.')
+                 raise Exception('CUDA SETUP: Setup Failed!')
+             # self.lib = ct.cdll.LoadLibrary(binary_path)
+             self.lib = ct.cdll.LoadLibrary(str(binary_path)) # $$$
+         else:
+             print(f"CUDA SETUP: Loading binary {binary_path}...")
+             # self.lib = ct.cdll.LoadLibrary(binary_path)
+             self.lib = ct.cdll.LoadLibrary(str(binary_path)) # $$$
+
+     @classmethod
+     def get_instance(cls):
+         if cls._instance is None:
+             cls._instance = cls.__new__(cls)
+             cls._instance.initialize()
+         return cls._instance
+
+
+ lib = CUDALibrary_Singleton.get_instance().lib
+ try:
+     lib.cadam32bit_g32
+     lib.get_context.restype = ct.c_void_p
+     lib.get_cusparse.restype = ct.c_void_p
+     COMPILED_WITH_CUDA = True
+ except AttributeError:
+     warn(
+         "The installed version of bitsandbytes was compiled without GPU support. "
+         "8-bit optimizers and GPU quantization are unavailable."
+     )
+     COMPILED_WITH_CUDA = False
check_lora_weights.py ADDED
@@ -0,0 +1,48 @@
+ import argparse
+ import os
+ import torch
+ from safetensors.torch import load_file
+ from library.utils import setup_logging
+ setup_logging()
+ import logging
+ logger = logging.getLogger(__name__)
+
+ def main(file):
+     logger.info(f"loading: {file}")
+     if os.path.splitext(file)[1] == ".safetensors":
+         sd = load_file(file)
+     else:
+         sd = torch.load(file, map_location="cpu")
+
+     values = []
+
+     keys = list(sd.keys())
+     for key in keys:
+         if "lora_up" in key or "lora_down" in key or "lora_A" in key or "lora_B" in key or "oft_" in key:
+             values.append((key, sd[key]))
+     print(f"number of LoRA modules: {len(values)}")
+
+     if args.show_all_keys:
+         for key in [k for k in keys if k not in values]:
+             values.append((key, sd[key]))
+         print(f"number of all modules: {len(values)}")
+
+     for key, value in values:
+         value = value.to(torch.float32)
+         print(f"{key},{str(tuple(value.size())).replace(', ', '-')},{torch.mean(torch.abs(value))},{torch.min(torch.abs(value))}")
+
+
+ def setup_parser() -> argparse.ArgumentParser:
+     parser = argparse.ArgumentParser()
+     parser.add_argument("file", type=str, help="model file to check / 重みを確認するモデルファイル")
+     parser.add_argument("-s", "--show_all_keys", action="store_true", help="show all keys / 全てのキーを表示する")
+
+     return parser
+
+
+ if __name__ == "__main__":
+     parser = setup_parser()
+
+     args = parser.parse_args()
+
+     main(args.file)
clean_captions_and_tags.py ADDED
@@ -0,0 +1,194 @@
1
+ # このスクリプトのライセンスは、Apache License 2.0とします
2
+ # (c) 2022 Kohya S. @kohya_ss
3
+
4
+ import argparse
5
+ import glob
6
+ import os
7
+ import json
8
+ import re
9
+
10
+ from tqdm import tqdm
11
+ from library.utils import setup_logging
12
+ setup_logging()
13
+ import logging
14
+ logger = logging.getLogger(__name__)
15
+
16
+ PATTERN_HAIR_LENGTH = re.compile(r', (long|short|medium) hair, ')
17
+ PATTERN_HAIR_CUT = re.compile(r', (bob|hime) cut, ')
18
+ PATTERN_HAIR = re.compile(r', ([\w\-]+) hair, ')
19
+ PATTERN_WORD = re.compile(r', ([\w\-]+|hair ornament), ')
20
+
21
+ # 複数人がいるとき、複数の髪色や目の色が定義されていれば削除する
22
+ PATTERNS_REMOVE_IN_MULTI = [
23
+ PATTERN_HAIR_LENGTH,
24
+ PATTERN_HAIR_CUT,
25
+ re.compile(r', [\w\-]+ eyes, '),
26
+ re.compile(r', ([\w\-]+ sleeves|sleeveless), '),
27
+ # 複数の髪型定義がある場合は削除する
28
+ re.compile(
29
+ r', (ponytail|braid|ahoge|twintails|[\w\-]+ bun|single hair bun|single side bun|two side up|two tails|[\w\-]+ braid|sidelocks), '),
30
+ ]
31
+
32
+
33
+ def clean_tags(image_key, tags):
34
+ # replace '_' to ' '
35
+ tags = tags.replace('^_^', '^@@@^')
36
+ tags = tags.replace('_', ' ')
37
+ tags = tags.replace('^@@@^', '^_^')
38
+
39
+ # remove rating: deepdanbooruのみ
40
+ tokens = tags.split(", rating")
41
+ if len(tokens) == 1:
42
+ # WD14 taggerのときはこちらになるのでメッセージは出さない
43
+ # logger.info("no rating:")
44
+ # logger.info(f"{image_key} {tags}")
45
+ pass
46
+ else:
47
+ if len(tokens) > 2:
48
+ logger.info("multiple ratings:")
49
+ logger.info(f"{image_key} {tags}")
50
+ tags = tokens[0]
51
+
52
+ tags = ", " + tags.replace(", ", ", , ") + ", " # カンマ付きで検索をするための身も蓋もない対策
53
+
54
+ # 複数の人物がいる場合は髪色等のタグを削除する
55
+ if 'girls' in tags or 'boys' in tags:
56
+ for pat in PATTERNS_REMOVE_IN_MULTI:
57
+ found = pat.findall(tags)
58
+ if len(found) > 1: # 二つ以上、タグがある
59
+ tags = pat.sub("", tags)
60
+
61
+ # 髪の特殊対応
62
+ srch_hair_len = PATTERN_HAIR_LENGTH.search(tags) # 髪の長さタグは例外なので避けておく(全員が同じ髪の長さの場合)
63
+ if srch_hair_len:
64
+ org = srch_hair_len.group()
65
+ tags = PATTERN_HAIR_LENGTH.sub(", @@@, ", tags)
66
+
67
+ found = PATTERN_HAIR.findall(tags)
68
+ if len(found) > 1:
69
+ tags = PATTERN_HAIR.sub("", tags)
70
+
71
+ if srch_hair_len:
72
+ tags = tags.replace(", @@@, ", org) # 戻す
73
+
74
+ # white shirtとshirtみたいな重複タグの削除
75
+ found = PATTERN_WORD.findall(tags)
76
+ for word in found:
77
+ if re.search(f", ((\w+) )+{word}, ", tags):
78
+ tags = tags.replace(f", {word}, ", "")
79
+
80
+ tags = tags.replace(", , ", ", ")
81
+ assert tags.startswith(", ") and tags.endswith(", ")
82
+ tags = tags[2:-2]
83
+ return tags
84
+
85
+
86
+ # 上から順に検索、置換される
87
+ # ('置換元文字列', '置換後文字列')
88
+ CAPTION_REPLACEMENTS = [
89
+ ('anime anime', 'anime'),
90
+ ('young ', ''),
91
+ ('anime girl', 'girl'),
92
+ ('cartoon female', 'girl'),
93
+ ('cartoon lady', 'girl'),
94
+ ('cartoon character', 'girl'), # a or ~s
95
+ ('cartoon woman', 'girl'),
96
+ ('cartoon women', 'girls'),
97
+ ('cartoon girl', 'girl'),
98
+ ('anime female', 'girl'),
99
+ ('anime lady', 'girl'),
100
+ ('anime character', 'girl'), # a or ~s
101
+ ('anime woman', 'girl'),
102
+ ('anime women', 'girls'),
103
+ ('lady', 'girl'),
104
+ ('female', 'girl'),
105
+ ('woman', 'girl'),
106
+ ('women', 'girls'),
107
+ ('people', 'girls'),
108
+ ('person', 'girl'),
109
+ ('a cartoon figure', 'a figure'),
110
+ ('a cartoon image', 'an image'),
111
+ ('a cartoon picture', 'a picture'),
112
+ ('an anime cartoon image', 'an image'),
113
+ ('a cartoon anime drawing', 'a drawing'),
114
+ ('a cartoon drawing', 'a drawing'),
115
+ ('girl girl', 'girl'),
116
+ ]
117
+
118
+
119
+ def clean_caption(caption):
120
+ for rf, rt in CAPTION_REPLACEMENTS:
121
+ replaced = True
122
+ while replaced:
123
+ bef = caption
124
+ caption = caption.replace(rf, rt)
125
+ replaced = bef != caption
126
+ return caption
127
+
128
+
129
+ def main(args):
130
+ if os.path.exists(args.in_json):
131
+ logger.info(f"loading existing metadata: {args.in_json}")
132
+ with open(args.in_json, "rt", encoding='utf-8') as f:
133
+ metadata = json.load(f)
134
+ else:
135
+ logger.error("no metadata / メタデータファイルがありません")
136
+ return
137
+
138
+ logger.info("cleaning captions and tags.")
139
+ image_keys = list(metadata.keys())
140
+ for image_key in tqdm(image_keys):
141
+ tags = metadata[image_key].get('tags')
142
+ if tags is None:
143
+ logger.error(f"image does not have tags / メタデータにタグがありません: {image_key}")
144
+ else:
145
+ org = tags
146
+ tags = clean_tags(image_key, tags)
147
+ metadata[image_key]['tags'] = tags
148
+ if args.debug and org != tags:
149
+ logger.info("FROM: " + org)
150
+ logger.info("TO: " + tags)
151
+
152
+ caption = metadata[image_key].get('caption')
153
+ if caption is None:
154
+ logger.error(f"image does not have caption / メタデータにキャプションがありません: {image_key}")
155
+ else:
156
+ org = caption
157
+ caption = clean_caption(caption)
158
+ metadata[image_key]['caption'] = caption
159
+ if args.debug and org != caption:
160
+ logger.info("FROM: " + org)
161
+ logger.info("TO: " + caption)
162
+
163
+ # metadataを書き出して終わり
164
+ logger.info(f"writing metadata: {args.out_json}")
165
+ with open(args.out_json, "wt", encoding='utf-8') as f:
166
+ json.dump(metadata, f, indent=2)
167
+ logger.info("done!")
168
+
169
+
170
+ def setup_parser() -> argparse.ArgumentParser:
171
+ parser = argparse.ArgumentParser()
172
+ # parser.add_argument("train_data_dir", type=str, help="directory for train images / 学習画像データのディレクトリ")
173
+ parser.add_argument("in_json", type=str, help="metadata file to input / 読み込むメタデータファイル")
174
+ parser.add_argument("out_json", type=str, help="metadata file to output / メタデータファイル書き出し先")
175
+ parser.add_argument("--debug", action="store_true", help="debug mode")
176
+
177
+ return parser
178
+
179
+
180
+ if __name__ == '__main__':
181
+ parser = setup_parser()
182
+
183
+ args, unknown = parser.parse_known_args()
184
+ if len(unknown) == 1:
185
+ logger.warning("WARNING: train_data_dir argument is removed. This script will not work with three arguments in future. Please specify two arguments: in_json and out_json.")
186
+ logger.warning("All captions and tags in the metadata are processed.")
187
+ logger.warning("警告: train_data_dir引数は不要になりました。将来的には三つの引数を指定すると動かなくなる予定です。読み込み元のメタデータと書き出し先の二つの引数だけ指定してください。")
188
+ logger.warning("メタデータ内のすべてのキャプションとタグが処理されます。")
189
+ args.in_json = args.out_json
190
+ args.out_json = unknown[0]
191
+ elif len(unknown) > 0:
192
+ raise ValueError(f"error: unrecognized arguments: {unknown}")
193
+
194
+ main(args)
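
A small sketch of what the cleaning functions do to a single entry (illustrative; the sample strings are made up and not part of the commit):

```python
# Hypothetical example of the caption/tag normalization in this script.
from clean_captions_and_tags import clean_caption, clean_tags

print(clean_caption("an anime cartoon image of a cartoon girl"))
# -> "an image of a girl" (replacements are applied repeatedly, in list order)

print(clean_tags("img_0001", "1girl, long_hair, blue_eyes, smile"))
# -> "1girl, long hair, blue eyes, smile"
# underscores become spaces; duplicate hair/eye tags are pruned only when
# multiple people ("girls"/"boys") appear in the tag string
```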
config_README-en.md ADDED
@@ -0,0 +1,386 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Original Source by kohya-ss
2
+
3
+ First version:
4
+ A.I Translation by Model: NousResearch/Nous-Hermes-2-Mixtral-8x7B-DPO, editing by Darkstorm2150
5
+
6
+ Some parts are manually added.
7
+
8
+ # Config Readme
9
+
10
+ This README is about the configuration files that can be passed with the `--dataset_config` option.
11
+
12
+ ## Overview
13
+
14
+ By passing a configuration file, users can make detailed settings.
15
+
16
+ * Multiple datasets can be configured
17
+ * For example, by setting `resolution` for each dataset, they can be mixed and trained.
18
+ * In training methods that support both the DreamBooth approach and the fine-tuning approach, datasets of the DreamBooth method and the fine-tuning method can be mixed.
19
+ * Settings can be changed for each subset
20
+ * A subset is a partition of the dataset by image directory or metadata. Several subsets make up a dataset.
21
+ * Options such as `keep_tokens` and `flip_aug` can be set for each subset. On the other hand, options such as `resolution` and `batch_size` can be set for each dataset, and their values are common among subsets belonging to the same dataset. More details will be provided later.
22
+
23
+ The configuration file format can be JSON or TOML. Considering the ease of writing, it is recommended to use [TOML](https://toml.io/ja/v1.0.0-rc.2). The following explanation assumes the use of TOML.
24
+
25
+
26
+ Here is an example of a configuration file written in TOML.
27
+
28
+ ```toml
29
+ [general]
30
+ shuffle_caption = true
31
+ caption_extension = '.txt'
32
+ keep_tokens = 1
33
+
34
+ # This is a DreamBooth-style dataset
35
+ [[datasets]]
36
+ resolution = 512
37
+ batch_size = 4
38
+ keep_tokens = 2
39
+
40
+ [[datasets.subsets]]
41
+ image_dir = 'C:\hoge'
42
+ class_tokens = 'hoge girl'
43
+ # This subset uses keep_tokens = 2 (the value of the parent datasets)
44
+
45
+ [[datasets.subsets]]
46
+ image_dir = 'C:\fuga'
47
+ class_tokens = 'fuga boy'
48
+ keep_tokens = 3
49
+
50
+ [[datasets.subsets]]
51
+ is_reg = true
52
+ image_dir = 'C:\reg'
53
+ class_tokens = 'human'
54
+ keep_tokens = 1
55
+
56
+ # This is a fine-tuning dataset
57
+ [[datasets]]
58
+ resolution = [768, 768]
59
+ batch_size = 2
60
+
61
+ [[datasets.subsets]]
62
+ image_dir = 'C:\piyo'
63
+ metadata_file = 'C:\piyo\piyo_md.json'
64
+ # This subset uses keep_tokens = 1 (the value of [general])
65
+ ```
66
+
67
+ In this example, three directories are trained as a DreamBooth-style dataset at 512x512 (batch size 4), and one directory is trained as a fine-tuning dataset at 768x768 (batch size 2).
68
+
69
+ ## Settings for datasets and subsets
70
+
71
+ Settings for datasets and subsets are divided into several registration locations.
72
+
73
+ * `[general]`
74
+ * This is where options that apply to all datasets or all subsets are specified.
75
+ * If there are options with the same name in the dataset-specific or subset-specific settings, the dataset-specific or subset-specific settings take precedence.
76
+ * `[[datasets]]`
77
+ * `datasets` is where settings for datasets are registered. This is where options that apply individually to each dataset are specified.
78
+ * If there are subset-specific settings, the subset-specific settings take precedence.
79
+ * `[[datasets.subsets]]`
80
+ * `datasets.subsets` is where settings for subsets are registered. This is where options that apply individually to each subset are specified.
81
+
82
+ Here is an image showing the correspondence between image directories and registration locations in the previous example.
83
+
84
+ ```
85
+ C:\
86
+ ├─ hoge -> [[datasets.subsets]] No.1 ┐ ┐
87
+ ├─ fuga -> [[datasets.subsets]] No.2 |-> [[datasets]] No.1 |-> [general]
88
+ ├─ reg -> [[datasets.subsets]] No.3 ┘ |
89
+ └─ piyo -> [[datasets.subsets]] No.4 --> [[datasets]] No.2 ┘
90
+ ```
91
+
92
+ The image directory corresponds to each `[[datasets.subsets]]`. Then, multiple `[[datasets.subsets]]` are combined to form one `[[datasets]]`. All `[[datasets]]` and `[[datasets.subsets]]` belong to `[general]`.
93
+
94
+ The available options for each registration location may differ, but if the same option is specified, the value in the lower registration location will take precedence. You can check how the `keep_tokens` option is handled in the previous example for better understanding.
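+
+ As a rough illustration (this is only a sketch, not the training scripts' actual implementation), the precedence can be thought of as a fallback lookup from the most specific registration location to the most general one:
+
+ ```python
+ # Minimal sketch of the precedence rule, assuming plain dicts for each location.
+ # The most specific location that defines the option wins: subset > dataset > general.
+ def resolve_option(name, subset_cfg, dataset_cfg, general_cfg, default=None):
+     for cfg in (subset_cfg, dataset_cfg, general_cfg):
+         if name in cfg:
+             return cfg[name]
+     return default
+
+ general = {"keep_tokens": 1}
+ dataset = {"keep_tokens": 2}
+ subset_a = {}                  # falls back to the dataset value
+ subset_b = {"keep_tokens": 3}  # uses its own value
+ print(resolve_option("keep_tokens", subset_a, dataset, general))  # 2
+ print(resolve_option("keep_tokens", subset_b, dataset, general))  # 3
+ ```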
95
+
96
+ In addition, the available options vary depending on which methods the training approach supports:
97
+
98
+ * Options specific to the DreamBooth method
99
+ * Options specific to the fine-tuning method
100
+ * Options available when using the caption dropout technique
101
+
102
+ In training methods that support both, the DreamBooth approach and the fine-tuning approach can be used together.
103
+ When doing so, note that the method is determined per dataset, so DreamBooth subsets and fine-tuning subsets cannot be mixed within the same dataset.
104
+ In other words, if you want to use both methods together, you need to set up subsets of different methods belonging to different datasets.
105
+
106
+ In terms of program behavior, a subset is treated as a fine-tuning subset if it has the `metadata_file` option. Therefore, for subsets belonging to the same dataset, there is no problem as long as either all of them have the `metadata_file` option or none of them do.
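+
+ As a sketch of this rule (mirroring the check performed by `config_util.py`, not quoting it verbatim), the per-dataset decision looks like this:
+
+ ```python
+ # Sketch: a dataset is fine-tuning style only if every subset has "metadata_file";
+ # it is DreamBooth style only if none of them has it. Anything else is an error.
+ def dataset_method(subsets):
+     if all("metadata_file" in s for s in subsets):
+         return "fine-tuning"
+     if all("metadata_file" not in s for s in subsets):
+         return "DreamBooth"
+     raise ValueError("DreamBooth and fine-tuning subsets cannot be mixed in one dataset")
+
+ print(dataset_method([{"image_dir": "C:\\hoge"}, {"image_dir": "C:\\fuga"}]))  # DreamBooth
+ ```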
107
+
108
+ Below, the available options will be explained. For options with the same name as the command-line argument, the explanation will be omitted in principle. Please refer to other READMEs.
109
+
110
+ ### Common options for all learning methods
111
+
112
+ These are options that can be specified regardless of the learning method.
113
+
114
+ #### Data set specific options
115
+
116
+ These are options related to the configuration of the data set. They cannot be described in `datasets.subsets`.
117
+
118
+
119
+ | Option Name | Example Setting | `[general]` | `[[datasets]]` |
120
+ | ---- | ---- | ---- | ---- |
121
+ | `batch_size` | `1` | o | o |
122
+ | `bucket_no_upscale` | `true` | o | o |
123
+ | `bucket_reso_steps` | `64` | o | o |
124
+ | `enable_bucket` | `true` | o | o |
125
+ | `max_bucket_reso` | `1024` | o | o |
126
+ | `min_bucket_reso` | `128` | o | o |
127
+ | `resolution` | `256`, `[512, 512]` | o | o |
128
+
129
+ * `batch_size`
130
+ * This corresponds to the command-line argument `--train_batch_size`.
131
+ * `max_bucket_reso`, `min_bucket_reso`
132
+ * Specify the maximum and minimum resolutions of the bucket. It must be divisible by `bucket_reso_steps`.
133
+
134
+ These settings are fixed per dataset. That means that subsets belonging to the same dataset will share these settings. For example, if you want to prepare datasets with different resolutions, you can define them as separate datasets as shown in the example above, and set different resolutions for each.
135
+
136
+ #### Options for Subsets
137
+
138
+ These options are related to subset configuration.
139
+
140
+ | Option Name | Example | `[general]` | `[[datasets]]` | `[[dataset.subsets]]` |
141
+ | ---- | ---- | ---- | ---- | ---- |
142
+ | `color_aug` | `false` | o | o | o |
143
+ | `face_crop_aug_range` | `[1.0, 3.0]` | o | o | o |
144
+ | `flip_aug` | `true` | o | o | o |
145
+ | `keep_tokens` | `2` | o | o | o |
146
+ | `num_repeats` | `10` | o | o | o |
147
+ | `random_crop` | `false` | o | o | o |
148
+ | `shuffle_caption` | `true` | o | o | o |
149
+ | `caption_prefix` | `"masterpiece, best quality, "` | o | o | o |
150
+ | `caption_suffix` | `", from side"` | o | o | o |
151
+ | `caption_separator` | (not specified) | o | o | o |
152
+ | `keep_tokens_separator` | `"|||"` | o | o | o |
153
+ | `secondary_separator` | `";;;"` | o | o | o |
154
+ | `enable_wildcard` | `true` | o | o | o |
155
+
156
+ * `num_repeats`
157
+ * Specifies the number of repeats for images in a subset. This is equivalent to `--dataset_repeats` in fine-tuning but can be specified for any training method.
158
+ * `caption_prefix`, `caption_suffix`
159
+ * Specifies the prefix and suffix strings to be appended to the captions. Shuffling is performed with these strings included. Be cautious when using `keep_tokens`.
160
+ * `caption_separator`
161
+ * Specifies the string to separate the tags. The default is `,`. This option is usually not necessary to set.
162
+ * `keep_tokens_separator`
163
+ * Specifies the string to separate the parts to be fixed in the caption. For example, if you specify `aaa, bbb ||| ccc, ddd, eee, fff ||| ggg, hhh`, the parts `aaa, bbb` and `ggg, hhh` will remain, and the rest will be shuffled and dropped. The comma in between is not necessary. As a result, the prompt will be `aaa, bbb, eee, ccc, fff, ggg, hhh` or `aaa, bbb, fff, ccc, eee, ggg, hhh`, etc.
164
+ * `secondary_separator`
165
+ * Specifies an additional separator. The part separated by this separator is treated as one tag and is shuffled and dropped. It is then replaced by `caption_separator`. For example, if you specify `aaa;;;bbb;;;ccc`, it will be replaced by `aaa,bbb,ccc` or dropped together.
166
+ * `enable_wildcard`
167
+ * Enables wildcard notation. This will be explained later.
168
+
169
+ ### DreamBooth-specific options
170
+
171
+ DreamBooth-specific options exist only as options for subsets.
172
+
173
+ #### Subset-specific options
174
+
175
+ Options related to the configuration of DreamBooth subsets.
176
+
177
+ | Option Name | Example Setting | `[general]` | `[[datasets]]` | `[[dataset.subsets]]` |
178
+ | ---- | ---- | ---- | ---- | ---- |
179
+ | `image_dir` | `'C:\hoge'` | - | - | o (required) |
180
+ | `caption_extension` | `".txt"` | o | o | o |
181
+ | `class_tokens` | `"sks girl"` | - | - | o |
182
+ | `cache_info` | `false` | o | o | o |
183
+ | `is_reg` | `false` | - | - | o |
184
+
185
+ First, note that `image_dir` must be a path under which the image files are placed directly. In the traditional DreamBooth method images had to be placed in subdirectories, so this is not compatible with that specification. Also, naming a folder something like "5_cat" will not apply the repeat count or the class name to the images. If you want to set these individually, you need to specify them explicitly with `num_repeats` and `class_tokens`.
186
+
187
+ * `image_dir`
188
+ * Specifies the path to the image directory. This is a required option.
189
+ * Images must be placed directly under the directory.
190
+ * `class_tokens`
191
+ * Sets the class tokens.
192
+ * Only used during training when a corresponding caption file does not exist. The determination of whether or not to use it is made on a per-image basis. If `class_tokens` is not specified and a caption file is not found, an error will occur.
193
+ * `cache_info`
194
+ * Specifies whether to cache the image size and caption. If not specified, it is set to `false`. The cache is saved in `metadata_cache.json` in `image_dir`.
195
+ * Caching speeds up the loading of the dataset after the first time. It is effective when dealing with thousands of images or more.
196
+ * `is_reg`
197
+ * Specifies whether the images in the subset are regularization images. If not specified, it is set to `false`, meaning that the images are not regularization images.
198
+
199
+ ### Fine-tuning method specific options
200
+
201
+ Fine-tuning-specific options exist only as options for subsets.
202
+
203
+ #### Subset-specific options
204
+
205
+ These options are related to the configuration of the fine-tuning method's subsets.
206
+
207
+ | Option name | Example setting | `[general]` | `[[datasets]]` | `[[dataset.subsets]]` |
208
+ | ---- | ---- | ---- | ---- | ---- |
209
+ | `image_dir` | `'C:\hoge'` | - | - | o |
210
+ | `metadata_file` | `'C:\piyo\piyo_md.json'` | - | - | o (required) |
211
+
212
+ * `image_dir`
213
+ * Specify the path to the image directory. Unlike the DreamBooth method, specifying it is not mandatory, but it is recommended to do so.
214
+ * It does not need to be specified if the metadata file was generated with the `--full_path` option on the command line.
215
+ * The images must be placed directly under the directory.
216
+ * `metadata_file`
217
+ * Specify the path to the metadata file used for the subset. This is a required option.
218
+ * It is equivalent to the command-line argument `--in_json`.
219
+ * Due to the specification that a metadata file must be specified for each subset, it is recommended to avoid creating a metadata file with images from different directories as a single metadata file. It is strongly recommended to prepare a separate metadata file for each image directory and register them as separate subsets.
220
+
221
+ ### Options available when caption dropout method can be used
222
+
223
+ The options related to caption dropout exist only as options for subsets. Regardless of whether the DreamBooth method or the fine-tuning method is used, they can be specified as long as the training method supports caption dropout.
224
+
225
+ #### Subset-specific options
226
+
227
+ Options related to the setting of subsets that caption dropout can be used for.
228
+
229
+ | Option Name | `[general]` | `[[datasets]]` | `[[dataset.subsets]]` |
230
+ | ---- | ---- | ---- | ---- |
231
+ | `caption_dropout_every_n_epochs` | o | o | o |
232
+ | `caption_dropout_rate` | o | o | o |
233
+ | `caption_tag_dropout_rate` | o | o | o |
234
+
235
+ ## Behavior when there are duplicate subsets
236
+
237
+ In the case of the DreamBooth dataset, if there are multiple `image_dir` directories with the same content, they are considered to be duplicate subsets. For the fine-tuning dataset, if there are multiple `metadata_file` files with the same content, they are considered to be duplicate subsets. If duplicate subsets exist in the dataset, subsequent subsets will be ignored.
238
+
239
+ However, if they belong to different datasets, they are not considered duplicates. For example, if you have subsets with the same `image_dir` in different datasets, they will not be considered duplicates. This is useful when you want to train with the same image but with different resolutions.
240
+
241
+ ```toml
242
+ # If data sets exist separately, they are not considered duplicates and are both used for training.
243
+
244
+ [[datasets]]
245
+ resolution = 512
246
+
247
+ [[datasets.subsets]]
248
+ image_dir = 'C:\hoge'
249
+
250
+ [[datasets]]
251
+ resolution = 768
252
+
253
+ [[datasets.subsets]]
254
+ image_dir = 'C:\hoge'
255
+ ```
256
+
257
+ ## Command Line Argument and Configuration File
258
+
259
+ There are options in the configuration file that have overlapping roles with command line argument options.
260
+
261
+ The following command line argument options are ignored if a configuration file is passed:
262
+
263
+ * `--train_data_dir`
264
+ * `--reg_data_dir`
265
+ * `--in_json`
266
+
267
+ For the following command line argument options, if both the command line argument and the corresponding configuration file option are specified at the same time, the value in the configuration file takes precedence. Unless noted otherwise, the option names are the same in both.
268
+
269
+ | Command Line Argument Option | Prioritized Configuration File Option |
270
+ | ------------------------------- | ------------------------------------- |
271
+ | `--bucket_no_upscale` | |
272
+ | `--bucket_reso_steps` | |
273
+ | `--caption_dropout_every_n_epochs` | |
274
+ | `--caption_dropout_rate` | |
275
+ | `--caption_extension` | |
276
+ | `--caption_tag_dropout_rate` | |
277
+ | `--color_aug` | |
278
+ | `--dataset_repeats` | `num_repeats` |
279
+ | `--enable_bucket` | |
280
+ | `--face_crop_aug_range` | |
281
+ | `--flip_aug` | |
282
+ | `--keep_tokens` | |
283
+ | `--min_bucket_reso` | |
284
+ | `--random_crop` | |
285
+ | `--resolution` | |
286
+ | `--shuffle_caption` | |
287
+ | `--train_batch_size` | `batch_size` |
288
+
289
+ ## Error Guide
290
+
291
+ Currently, an external library is used to check whether the configuration file is written correctly, but the checking is not fully polished, so the error messages can be hard to understand. We plan to improve this in the future.
292
+
293
+ As a temporary measure, we will list common errors and their solutions. If you encounter an error even though it should be correct or if the error content is not understandable, please contact us as it may be a bug.
294
+
295
+ * `voluptuous.error.MultipleInvalid: required key not provided @ ...`: This error occurs when a required option is not provided. It is highly likely that you forgot to specify the option or misspelled the option name.
296
+ * The error location is indicated by `...` in the error message. For example, if you encounter an error like `voluptuous.error.MultipleInvalid: required key not provided @ data['datasets'][0]['subsets'][0]['image_dir']`, it means that the `image_dir` option does not exist in the 0th `subsets` of the 0th `datasets` setting.
297
+ * `voluptuous.error.MultipleInvalid: expected int for dictionary value @ ...`: This error occurs when the format of the specified value is incorrect. The `int` part changes depending on the target option. The example settings in this README may be helpful.
298
+ * `voluptuous.error.MultipleInvalid: extra keys not allowed @ ...`: This error occurs when there is an option name that is not supported. It is highly likely that you misspelled the option name or mistakenly included it.
299
+
300
+ ## Miscellaneous
301
+
302
+ ### Multi-line captions
303
+
304
+ By setting `enable_wildcard = true`, multiple-line captions are also enabled. If the caption file consists of multiple lines, one line is randomly selected as the caption.
305
+
306
+ ```txt
307
+ 1girl, hatsune miku, vocaloid, upper body, looking at viewer, microphone, stage
308
+ a girl with a microphone standing on a stage
309
+ detailed digital art of a girl with a microphone on a stage
310
+ ```
311
+
312
+ It can be combined with wildcard notation.
313
+
314
+ In metadata files, you can also specify multiple-line captions. In the `.json` metadata file, use `\n` to represent a line break. If the caption file consists of multiple lines, `merge_captions_to_metadata.py` will create a metadata file in this format.
315
+
316
+ The tags in the metadata (`tags`) are added to each line of the caption.
317
+
318
+ ```json
319
+ {
320
+ "/path/to/image.png": {
321
+ "caption": "a cartoon of a frog with the word frog on it\ntest multiline caption1\ntest multiline caption2",
322
+ "tags": "open mouth, simple background, standing, no humans, animal, black background, frog, animal costume, animal focus"
323
+ },
324
+ ...
325
+ }
326
+ ```
327
+
328
+ In this case, the actual caption will be `a cartoon of a frog with the word frog on it, open mouth, simple background ...`, `test multiline caption1, open mouth, simple background ...`, `test multiline caption2, open mouth, simple background ...`, etc.
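+
+ As a rough sketch (a hypothetical helper, not the repository's actual code), one caption line is chosen at random and the tags are appended to it:
+
+ ```python
+ import random
+
+ def build_caption(entry, separator=", "):
+     # pick one of the caption lines at random, then append the tags
+     caption = random.choice(entry["caption"].split("\n"))
+     if entry.get("tags"):
+         caption = caption + separator + entry["tags"]
+     return caption
+
+ entry = {
+     "caption": "a cartoon of a frog with the word frog on it\ntest multiline caption1",
+     "tags": "open mouth, simple background, standing",
+ }
+ print(build_caption(entry))
+ # e.g. "test multiline caption1, open mouth, simple background, standing"
+ ```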
329
+
330
+ ### Example of configuration file : `secondary_separator`, wildcard notation, `keep_tokens_separator`, etc.
331
+
332
+ ```toml
333
+ [general]
334
+ flip_aug = true
335
+ color_aug = false
336
+ resolution = [1024, 1024]
337
+
338
+ [[datasets]]
339
+ batch_size = 6
340
+ enable_bucket = true
341
+ bucket_no_upscale = true
342
+ caption_extension = ".txt"
343
+ keep_tokens_separator= "|||"
344
+ shuffle_caption = true
345
+ caption_tag_dropout_rate = 0.1
346
+ secondary_separator = ";;;" # subset 側に書くこともできます / can be written in the subset side
347
+ enable_wildcard = true # 同上 / same as above
348
+
349
+ [[datasets.subsets]]
350
+ image_dir = "/path/to/image_dir"
351
+ num_repeats = 1
352
+
353
+ # ||| の前後はカンマは不要です(自動的に追加されます) / No comma is required before and after ||| (it is added automatically)
354
+ caption_prefix = "1girl, hatsune miku, vocaloid |||"
355
+
356
+ # ||| の後はシャッフル、drop されず残ります / After |||, it is not shuffled or dropped and remains
357
+ # 単純に文字列として連結されるので、カンマなどは自分で入れる必要があります / It is simply concatenated as a string, so you need to put commas yourself
358
+ caption_suffix = ", anime screencap ||| masterpiece, rating: general"
359
+ ```
360
+
361
+ ### Example of caption, secondary_separator notation: `secondary_separator = ";;;"`
362
+
363
+ ```txt
364
+ 1girl, hatsune miku, vocaloid, upper body, looking at viewer, sky;;;cloud;;;day, outdoors
365
+ ```
366
+ The part `sky;;;cloud;;;day` is replaced with `sky,cloud,day` without shuffling or dropping. When shuffling and dropping are enabled, it is processed as a whole (as one tag). For example, it becomes `vocaloid, 1girl, upper body, sky,cloud,day, outdoors, hatsune miku` (shuffled) or `vocaloid, 1girl, outdoors, looking at viewer, upper body, hatsune miku` (dropped).
367
+
368
+ ### Example of caption, enable_wildcard notation: `enable_wildcard = true`
369
+
370
+ ```txt
371
+ 1girl, hatsune miku, vocaloid, upper body, looking at viewer, {simple|white} background
372
+ ```
373
+ `simple` or `white` is randomly selected, and it becomes `simple background` or `white background`.
374
+
375
+ ```txt
376
+ 1girl, hatsune miku, vocaloid, {{retro style}}
377
+ ```
378
+ If you want to include `{` or `}` in the tag string, double them like `{{` or `}}` (in this example, the actual caption used for training is `{retro style}`).
379
+
380
+ ### Example of caption, `keep_tokens_separator` notation: `keep_tokens_separator = "|||"`
381
+
382
+ ```txt
383
+ 1girl, hatsune miku, vocaloid ||| stage, microphone, white shirt, smile ||| best quality, rating: general
384
+ ```
385
+ It becomes `1girl, hatsune miku, vocaloid, microphone, stage, white shirt, best quality, rating: general` or `1girl, hatsune miku, vocaloid, white shirt, smile, stage, microphone, best quality, rating: general` etc.
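+
+ For illustration, the following is a small, self-contained sketch of how a caption using `keep_tokens_separator` and `secondary_separator` could be processed. It follows the behaviour described above but is not the training scripts' actual implementation:
+
+ ```python
+ import random
+
+ def process_caption(caption, keep_sep="|||", secondary_sep=";;;", sep=", "):
+     # split into (fixed head, shuffled middle, fixed tail); assumes two keep_sep occurrences
+     head, middle, tail = [p.strip(" ,") for p in caption.split(keep_sep)]
+     tags = [t.strip() for t in middle.split(",") if t.strip()]
+     random.shuffle(tags)                                   # shuffle_caption = true
+     tags = [t.replace(secondary_sep, ",") for t in tags]   # ;;; groups stay together as one tag
+     return sep.join([head, sep.join(tags), tail])
+
+ print(process_caption(
+     "1girl, hatsune miku, vocaloid ||| stage, microphone, sky;;;cloud;;;day ||| best quality, rating: general"
+ ))
+ # e.g. "1girl, hatsune miku, vocaloid, microphone, sky,cloud,day, stage, best quality, rating: general"
+ ```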
386
+
config_README-ja.md ADDED
@@ -0,0 +1,388 @@
1
+ `--dataset_config` で渡すことができる設定ファイルに関する説明です。
2
+
3
+ ## 概要
4
+
5
+ 設定ファイルを渡すことにより、ユーザが細かい設定を行えるようにします。
6
+
7
+ * 複数のデータセットが設定可能になります
8
+ * 例えば `resolution` をデータセットごとに設定して、それらを混合して学習できます。
9
+ * DreamBooth の手法と fine tuning の手法の両方に対応している学習方法では、DreamBooth 方式と fine tuning 方式のデータセットを混合することが可能です。
10
+ * サブセットごとに設定を変更することが可能になります
11
+ * データセットを画像ディレクトリ別またはメタデータ別に分割したものがサブセットです。いくつかのサブセットが集まってデータセットを構成します。
12
+ * `keep_tokens` や `flip_aug` 等のオプションはサブセットごとに設定可能です。一方、`resolution` や `batch_size` といったオプションはデータセットごとに設定可能で、同じデータセットに属するサブセットでは値が共通になります。詳しくは後述します。
13
+
14
+ 設定ファイルの形式は JSON か TOML を利用できます。記述のしやすさを考えると [TOML](https://toml.io/ja/v1.0.0-rc.2) を利用するのがオススメです。以下、TOML の利用を前提に説明します。
15
+
16
+ TOML で記述した設定ファイルの例です。
17
+
18
+ ```toml
19
+ [general]
20
+ shuffle_caption = true
21
+ caption_extension = '.txt'
22
+ keep_tokens = 1
23
+
24
+ # これは DreamBooth 方式のデータセット
25
+ [[datasets]]
26
+ resolution = 512
27
+ batch_size = 4
28
+ keep_tokens = 2
29
+
30
+ [[datasets.subsets]]
31
+ image_dir = 'C:\hoge'
32
+ class_tokens = 'hoge girl'
33
+ # このサブセットは keep_tokens = 2 (所属する datasets の値が使われる)
34
+
35
+ [[datasets.subsets]]
36
+ image_dir = 'C:\fuga'
37
+ class_tokens = 'fuga boy'
38
+ keep_tokens = 3
39
+
40
+ [[datasets.subsets]]
41
+ is_reg = true
42
+ image_dir = 'C:\reg'
43
+ class_tokens = 'human'
44
+ keep_tokens = 1
45
+
46
+ # これは fine tuning 方式のデータセット
47
+ [[datasets]]
48
+ resolution = [768, 768]
49
+ batch_size = 2
50
+
51
+ [[datasets.subsets]]
52
+ image_dir = 'C:\piyo'
53
+ metadata_file = 'C:\piyo\piyo_md.json'
54
+ # このサブセットは keep_tokens = 1 (general の値が使われる)
55
+ ```
56
+
57
+ この例では、3 つのディレクトリを DreamBooth 方式のデータセットとして 512x512 (batch size 4) で学習させ、1 つのディレクトリを fine tuning 方式のデータセットとして 768x768 (batch size 2) で学習させることになります。
58
+
59
+ ## データセット・サブセットに関する設定
60
+
61
+ データセット・サブセットに関する設定は、登録可能な箇所がいくつかに分かれています。
62
+
63
+ * `[general]`
64
+ * 全データセットまたは全サブセットに適用されるオプションを指定する箇所です。
65
+ * データセットごとの設定及びサブセットごとの設定に同名のオプションが存在していた場合には、データセット・サブセットごとの設定が優先されます。
66
+ * `[[datasets]]`
67
+ * `datasets` はデータセットに関する設定の登録箇所になります。各データセットに個別に適用されるオプションを指定する箇所です。
68
+ * サブセットごとの設定が存在していた場合には、サブセットごとの設定が優先されます。
69
+ * `[[datasets.subsets]]`
70
+ * `datasets.subsets` はサブセットに関する設定の登録箇所になります。各サブセットに個別に適用されるオプションを指定する箇所です。
71
+
72
+ 先程の例における、画像ディレクトリと登録箇所の対応に関するイメージ図です。
73
+
74
+ ```
75
+ C:\
76
+ ├─ hoge -> [[datasets.subsets]] No.1 ┐ ┐
77
+ ├─ fuga -> [[datasets.subsets]] No.2 |-> [[datasets]] No.1 |-> [general]
78
+ ├─ reg -> [[datasets.subsets]] No.3 ┘ |
79
+ └─ piyo -> [[datasets.subsets]] No.4 --> [[datasets]] No.2 ┘
80
+ ```
81
+
82
+ 画像ディレクトリがそれぞれ1つの `[[datasets.subsets]]` に対応しています。そして `[[datasets.subsets]]` が1つ以上組み合わさって1つの `[[datasets]]` を構成します。`[general]` には全ての `[[datasets]]`, `[[datasets.subsets]]` が属します。
83
+
84
+ 登録箇所ごとに指定可能なオプションは異なりますが、同名のオプションが指定された場合は下位の登録箇所にある値が優先されます。先程の例の `keep_tokens` オプションの扱われ方を確認してもらうと理解しやすいかと思います。
85
+
86
+ 加えて、学習方法が対応している手法によっても指定可能なオプションが変化します。
87
+
88
+ * DreamBooth 方式専用のオプション
89
+ * fine tuning 方式専用のオプション
90
+ * caption dropout の手法が使える場合のオプション
91
+
92
+ DreamBooth の手法と fine tuning の手法の両方とも利用可能な学習方法では、両者を併用することができます。
93
+ 併用する際の注意点として、DreamBooth 方式なのか fine tuning 方式なのかはデータセット単位で判別を行っているため、同じデータセット中に DreamBooth 方式のサブセットと fine tuning 方式のサブセットを混在させることはできません。
94
+ つまり、これらを併用したい場合には異なる方式のサブセットが異なるデータセットに所属するように設定する必要があります。
95
+
96
+ プログラムの挙動としては、後述する `metadata_file` オプションが存在していたら fine tuning 方式のサブセットだと判断します。
97
+ そのため、同一のデータセットに所属するサブセットについて言うと、「全てが `metadata_file` オプションを持つ」か「全てが `metadata_file` オプションを持たない」かのどちらかになっていれば問題ありません。
98
+
99
+ 以下、利用可能なオプションを説明します。コマンドライン引数と名称が同一のオプションについては、基本的に説明を割愛します。他の README を参照してください。
100
+
101
+ ### 全学習方法で共通のオプション
102
+
103
+ 学習方法によらずに指定可能なオプションです。
104
+
105
+ #### データセット向けオプション
106
+
107
+ データセットの設定に関わるオプションです。`datasets.subsets` には記述できません。
108
+
109
+ | オプション名 | 設定例 | `[general]` | `[[datasets]]` |
110
+ | ---- | ---- | ---- | ---- |
111
+ | `batch_size` | `1` | o | o |
112
+ | `bucket_no_upscale` | `true` | o | o |
113
+ | `bucket_reso_steps` | `64` | o | o |
114
+ | `enable_bucket` | `true` | o | o |
115
+ | `max_bucket_reso` | `1024` | o | o |
116
+ | `min_bucket_reso` | `128` | o | o |
117
+ | `resolution` | `256`, `[512, 512]` | o | o |
118
+
119
+ * `batch_size`
120
+ * コマンドライン引数の `--train_batch_size` と同等です。
121
+ * `max_bucket_reso`, `min_bucket_reso`
122
+ * bucketの最大、最小解像度を指定します。`bucket_reso_steps` で割り切れる必要があります。
123
+
124
+ これらの設定はデータセットごとに固定です。
125
+ つまり、データセットに所属するサブセットはこれらの設定を共有することになります。
126
+ 例えば解像度が異なるデータセットを用意したい場合は、上に挙げた例のように別々のデータセットとして定義すれば別々の解像度を設定可能です。
127
+
128
+ #### サブセット向けオプション
129
+
130
+ サブセットの設定に関わるオプションです。
131
+
132
+ | オプション名 | 設定例 | `[general]` | `[[datasets]]` | `[[dataset.subsets]]` |
133
+ | ---- | ---- | ---- | ---- | ---- |
134
+ | `color_aug` | `false` | o | o | o |
135
+ | `face_crop_aug_range` | `[1.0, 3.0]` | o | o | o |
136
+ | `flip_aug` | `true` | o | o | o |
137
+ | `keep_tokens` | `2` | o | o | o |
138
+ | `num_repeats` | `10` | o | o | o |
139
+ | `random_crop` | `false` | o | o | o |
140
+ | `shuffle_caption` | `true` | o | o | o |
141
+ | `caption_prefix` | `“masterpiece, best quality, ”` | o | o | o |
142
+ | `caption_suffix` | `“, from side”` | o | o | o |
143
+ | `caption_separator` | (通常は設定しません) | o | o | o |
144
+ | `keep_tokens_separator` | `“|||”` | o | o | o |
145
+ | `secondary_separator` | `“;;;”` | o | o | o |
146
+ | `enable_wildcard` | `true` | o | o | o |
147
+
148
+ * `num_repeats`
149
+ * サブセットの画像の繰り返し回数を指定します。fine tuning における `--dataset_repeats` に相当しますが、`num_repeats` はどの学習方法でも指定可能です。
150
+ * `caption_prefix`, `caption_suffix`
151
+ * キャプションの前、後に付与する文字列を指定します。シャッフルはこれらの文字列を含めた状態で行われます。`keep_tokens` を指定する場合には注意してください。
152
+
153
+ * `caption_separator`
154
+ * タグを区切る文字列を指定します。デフォルトは `,` です。このオプションは通常は設定する必要はありません。
155
+
156
+ * `keep_tokens_separator`
157
+ * キャプションで固定したい部分を区切る文字列を指定します。たとえば `aaa, bbb ||| ccc, ddd, eee, fff ||| ggg, hhh` のように指定すると、`aaa, bbb` と `ggg, hhh` の部分はシャッフル、drop されず残ります。間のカンマは不要です。結果としてプロンプトは `aaa, bbb, eee, ccc, fff, ggg, hhh` や `aaa, bbb, fff, ccc, eee, ggg, hhh` などになります。
158
+
159
+ * `secondary_separator`
160
+ * 追加の区切り文字を指定します。この区切り文字で区切られた部分は一つのタグとして扱われ、シャッフル、drop されます。その後、`caption_separator` に置き換えられます。たとえば `aaa;;;bbb;;;ccc` のように指定すると、`aaa,bbb,ccc` に置き換えられるか、まとめて drop されます。
161
+
162
+ * `enable_wildcard`
163
+ * ワイルドカード記法および複数行キャプションを有効にします。ワイルドカード記法、複数行キャプションについては後述します。
164
+
165
+ ### DreamBooth 方式専用のオプション
166
+
167
+ DreamBooth 方式のオプションは、サブセット向けオプションのみ存在します。
168
+
169
+ #### サブセット向けオプション
170
+
171
+ DreamBooth 方式のサブセットの設定に関わるオプションです。
172
+
173
+ | オプション名 | 設定例 | `[general]` | `[[datasets]]` | `[[dataset.subsets]]` |
174
+ | ---- | ---- | ---- | ---- | ---- |
175
+ | `image_dir` | `‘C:\hoge’` | - | - | o(必須) |
176
+ | `caption_extension` | `".txt"` | o | o | o |
177
+ | `class_tokens` | `“sks girl”` | - | - | o |
178
+ | `cache_info` | `false` | o | o | o |
179
+ | `is_reg` | `false` | - | - | o |
180
+
181
+ まず注意点として、 `image_dir` には画像ファイルが直下に置かれているパスを指定する必要があります。従来の DreamBooth の手法ではサブディレクトリに画像を置く必要がありましたが、そちらとは仕様に互換性がありません。また、`5_cat` のようなフォルダ名にしても、画像の繰り返し回数とクラス名は反映されません。これらを個別に設定したい場合、`num_repeats` と `class_tokens` で明示的に指定する必要があることに注意してください。
182
+
183
+ * `image_dir`
184
+ * 画像ディレクトリのパスを指定します。指定必須オプションです。
185
+ * 画像はディレクトリ直下に置かれている必要があります。
186
+ * `class_tokens`
187
+ * クラストークンを設定します。
188
+ * 画像に対応する caption ファイルが存在しない場合にのみ学習時に利用されます。利用するかどうかの判定は画像ごとに行います。`class_tokens` を指定しなかった場合に caption ファイルも見つからなかった場合にはエラーになります。
189
+ * `cache_info`
190
+ * 画像サイズ、キャプションをキャッシュするかどうかを指定します。指定しなかった場合は `false` になります。キャッシュは `image_dir` に `metadata_cache.json` というファイル名で保存されます。
191
+ * キャッシュを行うと、二回目以降のデータセット読み込みが高速化されます。数千枚以上の画像を扱う場合には有効です。
192
+ * `is_reg`
193
+ * サブセットの画像が正規化用かどうかを指定します。指定しなかった場合は `false` として、つまり正規化画像ではないとして扱います。
194
+
195
+ ### fine tuning 方式専用のオプション
196
+
197
+ fine tuning 方式のオプションは、サブセット向けオプションのみ存在します。
198
+
199
+ #### サブセット向けオプション
200
+
201
+ fine tuning 方式のサブセットの設定に関わるオプションです。
202
+
203
+ | オプション名 | 設定例 | `[general]` | `[[datasets]]` | `[[dataset.subsets]]` |
204
+ | ---- | ---- | ---- | ---- | ---- |
205
+ | `image_dir` | `‘C:\hoge’` | - | - | o |
206
+ | `metadata_file` | `'C:\piyo\piyo_md.json'` | - | - | o(必須) |
207
+
208
+ * `image_dir`
209
+ * 画像ディレクトリのパスを指定します。DreamBooth の手法の方とは異なり指定は必須ではありませんが、設定することを推奨します。
210
+ * 指定する必要がない状況としては、メタデータファイルの生成時に `--full_path` を付与して実行していた場合です。
211
+ * 画像はディレクトリ直下に置かれている必要があります。
212
+ * `metadata_file`
213
+ * サブセットで利用されるメタデータファイルのパスを指定します。指定必須オプションです。
214
+ * コマンドライン引数の `--in_json` と同等です。
215
+ * サブセットごとにメタデータファイルを指定する必要がある仕様上、ディレクトリを跨いだメタデータを1つのメタデータファイルとして作成することは避けた方が良いでしょう。画像ディレクトリごとにメタデータファイルを用意し、それらを別々のサブセットとして登録することを強く推奨します。
216
+
217
+ ### caption dropout の手法が使える場合に指定可能なオプション
218
+
219
+ caption dropout の手法が使える場合のオプションは、サブセット向けオプションのみ存在します。
220
+ DreamBooth 方式か fine tuning 方式かに関わらず、caption dropout に対応している学習方法であれば指定可能です。
221
+
222
+ #### サブセット向けオプション
223
+
224
+ caption dropout が使えるサブセットの設定に関わるオプションです。
225
+
226
+ | オプション名 | `[general]` | `[[datasets]]` | `[[dataset.subsets]]` |
227
+ | ---- | ---- | ---- | ---- |
228
+ | `caption_dropout_every_n_epochs` | o | o | o |
229
+ | `caption_dropout_rate` | o | o | o |
230
+ | `caption_tag_dropout_rate` | o | o | o |
231
+
232
+ ## 重複したサブセットが存在する時の挙動
233
+
234
+ DreamBooth 方式のデータセットの場合、その中にある `image_dir` が同一のサブセットは重複していると見なされます。
235
+ fine tuning 方式のデータセットの場合は、その中にある `metadata_file` が同一のサブセットは重複していると見なされます。
236
+ データセット中に重複したサブセットが存在する場合、2個目以降は無視されます。
237
+
238
+ 一方、異なるデータセットに所属している場合は、重複しているとは見なされません。
239
+ 例えば、以下のように同一の `image_dir` を持つサブセットを別々のデータセットに入れた場合には、重複していないと見なします。
240
+ これは、同じ画像でも異なる解像度で学習したい場合に役立ちます。
241
+
242
+ ```toml
243
+ # 別々のデータセットに存在している場合は重複とは見なされず、両方とも学習に使われる
244
+
245
+ [[datasets]]
246
+ resolution = 512
247
+
248
+ [[datasets.subsets]]
249
+ image_dir = 'C:\hoge'
250
+
251
+ [[datasets]]
252
+ resolution = 768
253
+
254
+ [[datasets.subsets]]
255
+ image_dir = 'C:\hoge'
256
+ ```
257
+
258
+ ## コマンドライン引数との併用
259
+
260
+ 設定ファイルのオプションの中には、コマンドライン引数のオプションと役割が重複しているものがあります。
261
+
262
+ 以下に挙げるコマンドライン引数のオプションは、設定ファイルを渡した場合には無視されます。
263
+
264
+ * `--train_data_dir`
265
+ * `--reg_data_dir`
266
+ * `--in_json`
267
+
268
+ 以下に挙げるコマンドライン引数のオプションは、コマンドライン引数と設定ファイルで同時に指定された場合、コマンドライン引数の値よりも設定ファイルの値が優先されます。特に断りがなければ同名のオプションとなります。
269
+
270
+ | コマンドライン引数のオプション | 優先される設定ファイルのオプション |
271
+ | ---------------------------------- | ---------------------------------- |
272
+ | `--bucket_no_upscale` | |
273
+ | `--bucket_reso_steps` | |
274
+ | `--caption_dropout_every_n_epochs` | |
275
+ | `--caption_dropout_rate` | |
276
+ | `--caption_extension` | |
277
+ | `--caption_tag_dropout_rate` | |
278
+ | `--color_aug` | |
279
+ | `--dataset_repeats` | `num_repeats` |
280
+ | `--enable_bucket` | |
281
+ | `--face_crop_aug_range` | |
282
+ | `--flip_aug` | |
283
+ | `--keep_tokens` | |
284
+ | `--min_bucket_reso` | |
285
+ | `--random_crop` | |
286
+ | `--resolution` | |
287
+ | `--shuffle_caption` | |
288
+ | `--train_batch_size` | `batch_size` |
289
+
290
+ ## エラーの手引き
291
+
292
+ 現在、外部ライブラリを利用して設定ファイルの記述が正しいかどうかをチェックしているのですが、整備が行き届いておらずエラーメッセージがわかりづらいという問題があります。
293
+ 将来的にはこの問題の改善に取り組む予定です。
294
+
295
+ 次善策として、頻出のエラーとその対処法について載せておきます。
296
+ 正しいはずなのにエラーが出る場合、エラー内容がどうしても分からない場合は、バグかもしれないのでご連絡ください。
297
+
298
+ * `voluptuous.error.MultipleInvalid: required key not provided @ ...`: 指定必須のオプションが指定されていないというエラーです。指定を忘れているか、オプション名を間違って記述している可能性が高いです。
299
+ * `...` の箇所にはエラーが発生した場所が載っています。例えば `voluptuous.error.MultipleInvalid: required key not provided @ data['datasets'][0]['subsets'][0]['image_dir']` のようなエラーが出たら、0 番目の `datasets` 中の 0 番目の `subsets` の設定に `image_dir` が存在しないということになります。
300
+ * `voluptuous.error.MultipleInvalid: expected int for dictionary value @ ...`: 指定する値の形式が不正というエラーです。値の形式が間違っている可能性が高いです。`int` の部分は対象となるオプションによって変わります。この README に載っているオプションの「設定例」が役立つかもしれません。
301
+ * `voluptuous.error.MultipleInvalid: extra keys not allowed @ ...`: 対応していないオプション名が存在している場合に発生するエラーです。オプション名を間違って記述しているか、誤って紛れ込んでいる可能性が高いです。
302
+
303
+ ## その他
304
+
305
+ ### 複数行キャプション
306
+
307
+ `enable_wildcard = true` を設定することで、複数行キャプションも同時に有効になります。キャプションファイルが複数の行からなる場合、ランダムに一つの行が選ばれてキャプションとして利用されます。
308
+
309
+ ```txt
310
+ 1girl, hatsune miku, vocaloid, upper body, looking at viewer, microphone, stage
311
+ a girl with a microphone standing on a stage
312
+ detailed digital art of a girl with a microphone on a stage
313
+ ```
314
+
315
+ ワイルドカード記法と組み合わせることも可能です。
316
+
317
+ メタデータファイルでも同様に複数行キャプションを指定することができます。メタデータの .json 内には、`\n` を使って改行を表現してください。キャプションファイルが複数行からなる場合、`merge_captions_to_metadata.py` を使うと、この形式でメタデータファイルが作成されます。
318
+
319
+ メタデータのタグ (`tags`) は、キャプションの各行に追加されます。
320
+
321
+ ```json
322
+ {
323
+ "/path/to/image.png": {
324
+ "caption": "a cartoon of a frog with the word frog on it\ntest multiline caption1\ntest multiline caption2",
325
+ "tags": "open mouth, simple background, standing, no humans, animal, black background, frog, animal costume, animal focus"
326
+ },
327
+ ...
328
+ }
329
+ ```
330
+
331
+ この場合、実際のキャプションは `a cartoon of a frog with the word frog on it, open mouth, simple background ...` または `test multiline caption1, open mouth, simple background ...`、 `test multiline caption2, open mouth, simple background ...` 等になります。
332
+
333
+ ### 設定ファイルの記述例:追加の区切り文字、ワイルドカード記法、`keep_tokens_separator` 等
334
+
335
+ ```toml
336
+ [general]
337
+ flip_aug = true
338
+ color_aug = false
339
+ resolution = [1024, 1024]
340
+
341
+ [[datasets]]
342
+ batch_size = 6
343
+ enable_bucket = true
344
+ bucket_no_upscale = true
345
+ caption_extension = ".txt"
346
+ keep_tokens_separator= "|||"
347
+ shuffle_caption = true
348
+ caption_tag_dropout_rate = 0.1
349
+ secondary_separator = ";;;" # subset 側に書くこともできます / can be written in the subset side
350
+ enable_wildcard = true # 同上 / same as above
351
+
352
+ [[datasets.subsets]]
353
+ image_dir = "/path/to/image_dir"
354
+ num_repeats = 1
355
+
356
+ # ||| の前後はカンマは不要です(自動的に追加されます) / No comma is required before and after ||| (it is added automatically)
357
+ caption_prefix = "1girl, hatsune miku, vocaloid |||"
358
+
359
+ # ||| の後はシャッフル、drop されず残ります / After |||, it is not shuffled or dropped and remains
360
+ # 単純に文字列として連結されるので、カンマなどは自分で入れる必要があります / It is simply concatenated as a string, so you need to put commas yourself
361
+ caption_suffix = ", anime screencap ||| masterpiece, rating: general"
362
+ ```
363
+
364
+ ### キャプション記述例、secondary_separator 記法:`secondary_separator = ";;;"` の場合
365
+
366
+ ```txt
367
+ 1girl, hatsune miku, vocaloid, upper body, looking at viewer, sky;;;cloud;;;day, outdoors
368
+ ```
369
+ `sky;;;cloud;;;day` の部分はシャッフル、drop されず `sky,cloud,day` に置換されます。シャッフル、drop が有効な場合、まとめて(一つのタグとして)処理されます。つまり `vocaloid, 1girl, upper body, sky,cloud,day, outdoors, hatsune miku` (シャッフル)や `vocaloid, 1girl, outdoors, looking at viewer, upper body, hatsune miku` (drop されたケース)などになります。
370
+
371
+ ### キャプション記述例、ワイルドカード記法: `enable_wildcard = true` の場合
372
+
373
+ ```txt
374
+ 1girl, hatsune miku, vocaloid, upper body, looking at viewer, {simple|white} background
375
+ ```
376
+ ランダムに `simple` または `white` が選ばれ、`simple background` または `white background` になります。
377
+
378
+ ```txt
379
+ 1girl, hatsune miku, vocaloid, {{retro style}}
380
+ ```
381
+ タグ文字列に `{` や `}` そのものを含めたい場合は `{{` や `}}` のように二つ重ねてください(この例では実際に学習に用いられるキャプションは `{retro style}` になります)。
382
+
383
+ ### キャプション記述例、`keep_tokens_separator` 記法: `keep_tokens_separator = "|||"` の場合
384
+
385
+ ```txt
386
+ 1girl, hatsune miku, vocaloid ||| stage, microphone, white shirt, smile ||| best quality, rating: general
387
+ ```
388
+ `1girl, hatsune miku, vocaloid, microphone, stage, white shirt, best quality, rating: general` や `1girl, hatsune miku, vocaloid, white shirt, smile, stage, microphone, best quality, rating: general` などになります。
config_util.py ADDED
@@ -0,0 +1,721 @@
1
+ import argparse
2
+ from dataclasses import (
3
+ asdict,
4
+ dataclass,
5
+ )
6
+ import functools
7
+ import random
8
+ from textwrap import dedent, indent
9
+ import json
10
+ from pathlib import Path
11
+
12
+ # from toolz import curry
13
+ from typing import (
14
+ List,
15
+ Optional,
16
+ Sequence,
17
+ Tuple,
18
+ Union,
19
+ )
20
+
21
+ import toml
22
+ import voluptuous
23
+ from voluptuous import (
24
+ Any,
25
+ ExactSequence,
26
+ MultipleInvalid,
27
+ Object,
28
+ Required,
29
+ Schema,
30
+ )
31
+ from transformers import CLIPTokenizer
32
+
33
+ from . import train_util
34
+ from .train_util import (
35
+ DreamBoothSubset,
36
+ FineTuningSubset,
37
+ ControlNetSubset,
38
+ DreamBoothDataset,
39
+ FineTuningDataset,
40
+ ControlNetDataset,
41
+ DatasetGroup,
42
+ )
43
+ from .utils import setup_logging
44
+
45
+ setup_logging()
46
+ import logging
47
+
48
+ logger = logging.getLogger(__name__)
49
+
50
+
51
+ def add_config_arguments(parser: argparse.ArgumentParser):
52
+ parser.add_argument(
53
+ "--dataset_config", type=Path, default=None, help="config file for detail settings / 詳細な設定用の設定ファイル"
54
+ )
55
+
56
+
57
+ # TODO: inherit Params class in Subset, Dataset
58
+
59
+
60
+ @dataclass
61
+ class BaseSubsetParams:
62
+ image_dir: Optional[str] = None
63
+ num_repeats: int = 1
64
+ shuffle_caption: bool = False
65
+ caption_separator: str = ","
66
+ keep_tokens: int = 0
67
+ keep_tokens_separator: Optional[str] = None
68
+ secondary_separator: Optional[str] = None
69
+ enable_wildcard: bool = False
70
+ color_aug: bool = False
71
+ flip_aug: bool = False
72
+ face_crop_aug_range: Optional[Tuple[float, float]] = None
73
+ random_crop: bool = False
74
+ caption_prefix: Optional[str] = None
75
+ caption_suffix: Optional[str] = None
76
+ caption_dropout_rate: float = 0.0
77
+ caption_dropout_every_n_epochs: int = 0
78
+ caption_tag_dropout_rate: float = 0.0
79
+ token_warmup_min: int = 1
80
+ token_warmup_step: float = 0
81
+
82
+
83
+ @dataclass
84
+ class DreamBoothSubsetParams(BaseSubsetParams):
85
+ is_reg: bool = False
86
+ class_tokens: Optional[str] = None
87
+ caption_extension: str = ".caption"
88
+ cache_info: bool = False
89
+ alpha_mask: bool = False
90
+
91
+
92
+ @dataclass
93
+ class FineTuningSubsetParams(BaseSubsetParams):
94
+ metadata_file: Optional[str] = None
95
+ alpha_mask: bool = False
96
+
97
+
98
+ @dataclass
99
+ class ControlNetSubsetParams(BaseSubsetParams):
100
+ conditioning_data_dir: str = None
101
+ caption_extension: str = ".caption"
102
+ cache_info: bool = False
103
+
104
+
105
+ @dataclass
106
+ class BaseDatasetParams:
107
+ tokenizer: Union[CLIPTokenizer, List[CLIPTokenizer]] = None
108
+ max_token_length: int = None
109
+ resolution: Optional[Tuple[int, int]] = None
110
+ network_multiplier: float = 1.0
111
+ debug_dataset: bool = False
112
+
113
+
114
+ @dataclass
115
+ class DreamBoothDatasetParams(BaseDatasetParams):
116
+ batch_size: int = 1
117
+ enable_bucket: bool = False
118
+ min_bucket_reso: int = 256
119
+ max_bucket_reso: int = 1024
120
+ bucket_reso_steps: int = 64
121
+ bucket_no_upscale: bool = False
122
+ prior_loss_weight: float = 1.0
123
+
124
+
125
+ @dataclass
126
+ class FineTuningDatasetParams(BaseDatasetParams):
127
+ batch_size: int = 1
128
+ enable_bucket: bool = False
129
+ min_bucket_reso: int = 256
130
+ max_bucket_reso: int = 1024
131
+ bucket_reso_steps: int = 64
132
+ bucket_no_upscale: bool = False
133
+
134
+
135
+ @dataclass
136
+ class ControlNetDatasetParams(BaseDatasetParams):
137
+ batch_size: int = 1
138
+ enable_bucket: bool = False
139
+ min_bucket_reso: int = 256
140
+ max_bucket_reso: int = 1024
141
+ bucket_reso_steps: int = 64
142
+ bucket_no_upscale: bool = False
143
+
144
+
145
+ @dataclass
146
+ class SubsetBlueprint:
147
+ params: Union[DreamBoothSubsetParams, FineTuningSubsetParams]
148
+
149
+
150
+ @dataclass
151
+ class DatasetBlueprint:
152
+ is_dreambooth: bool
153
+ is_controlnet: bool
154
+ params: Union[DreamBoothDatasetParams, FineTuningDatasetParams]
155
+ subsets: Sequence[SubsetBlueprint]
156
+
157
+
158
+ @dataclass
159
+ class DatasetGroupBlueprint:
160
+ datasets: Sequence[DatasetBlueprint]
161
+
162
+
163
+ @dataclass
164
+ class Blueprint:
165
+ dataset_group: DatasetGroupBlueprint
166
+
167
+
168
+ class ConfigSanitizer:
169
+ # @curry
170
+ @staticmethod
171
+ def __validate_and_convert_twodim(klass, value: Sequence) -> Tuple:
172
+ Schema(ExactSequence([klass, klass]))(value)
173
+ return tuple(value)
174
+
175
+ # @curry
176
+ @staticmethod
177
+ def __validate_and_convert_scalar_or_twodim(klass, value: Union[float, Sequence]) -> Tuple:
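+ # accepts either a scalar or a two-element sequence and normalizes it to a 2-tuple,
+ # e.g. resolution = 256 -> (256, 256), resolution = [512, 768] -> (512, 768)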
178
+ Schema(Any(klass, ExactSequence([klass, klass])))(value)
179
+ try:
180
+ Schema(klass)(value)
181
+ return (value, value)
182
+ except:
183
+ return ConfigSanitizer.__validate_and_convert_twodim(klass, value)
184
+
185
+ # subset schema
186
+ SUBSET_ASCENDABLE_SCHEMA = {
187
+ "color_aug": bool,
188
+ "face_crop_aug_range": functools.partial(__validate_and_convert_twodim.__func__, float),
189
+ "flip_aug": bool,
190
+ "num_repeats": int,
191
+ "random_crop": bool,
192
+ "shuffle_caption": bool,
193
+ "keep_tokens": int,
194
+ "keep_tokens_separator": str,
195
+ "secondary_separator": str,
196
+ "caption_separator": str,
197
+ "enable_wildcard": bool,
198
+ "token_warmup_min": int,
199
+ "token_warmup_step": Any(float, int),
200
+ "caption_prefix": str,
201
+ "caption_suffix": str,
202
+ }
203
+ # DO means DropOut
204
+ DO_SUBSET_ASCENDABLE_SCHEMA = {
205
+ "caption_dropout_every_n_epochs": int,
206
+ "caption_dropout_rate": Any(float, int),
207
+ "caption_tag_dropout_rate": Any(float, int),
208
+ }
209
+ # DB means DreamBooth
210
+ DB_SUBSET_ASCENDABLE_SCHEMA = {
211
+ "caption_extension": str,
212
+ "class_tokens": str,
213
+ "cache_info": bool,
214
+ }
215
+ DB_SUBSET_DISTINCT_SCHEMA = {
216
+ Required("image_dir"): str,
217
+ "is_reg": bool,
218
+ "alpha_mask": bool,
219
+ }
220
+ # FT means FineTuning
221
+ FT_SUBSET_DISTINCT_SCHEMA = {
222
+ Required("metadata_file"): str,
223
+ "image_dir": str,
224
+ "alpha_mask": bool,
225
+ }
226
+ CN_SUBSET_ASCENDABLE_SCHEMA = {
227
+ "caption_extension": str,
228
+ "cache_info": bool,
229
+ }
230
+ CN_SUBSET_DISTINCT_SCHEMA = {
231
+ Required("image_dir"): str,
232
+ Required("conditioning_data_dir"): str,
233
+ }
234
+
235
+ # datasets schema
236
+ DATASET_ASCENDABLE_SCHEMA = {
237
+ "batch_size": int,
238
+ "bucket_no_upscale": bool,
239
+ "bucket_reso_steps": int,
240
+ "enable_bucket": bool,
241
+ "max_bucket_reso": int,
242
+ "min_bucket_reso": int,
243
+ "resolution": functools.partial(__validate_and_convert_scalar_or_twodim.__func__, int),
244
+ "network_multiplier": float,
245
+ }
246
+
247
+ # options handled by argparse but not handled by user config
248
+ ARGPARSE_SPECIFIC_SCHEMA = {
249
+ "debug_dataset": bool,
250
+ "max_token_length": Any(None, int),
251
+ "prior_loss_weight": Any(float, int),
252
+ }
253
+ # for handling default None value of argparse
254
+ ARGPARSE_NULLABLE_OPTNAMES = [
255
+ "face_crop_aug_range",
256
+ "resolution",
257
+ ]
258
+ # prepare map because option name may differ among argparse and user config
259
+ ARGPARSE_OPTNAME_TO_CONFIG_OPTNAME = {
260
+ "train_batch_size": "batch_size",
261
+ "dataset_repeats": "num_repeats",
262
+ }
263
+
264
+ def __init__(self, support_dreambooth: bool, support_finetuning: bool, support_controlnet: bool, support_dropout: bool) -> None:
265
+ assert support_dreambooth or support_finetuning or support_controlnet, (
266
+ "Neither DreamBooth mode nor fine tuning mode nor controlnet mode specified. Please specify one mode or more."
267
+ + " / DreamBooth モードか fine tuning モードか controlnet モードのどれも指定されていません。1つ以上指定してください。"
268
+ )
269
+
270
+ self.db_subset_schema = self.__merge_dict(
271
+ self.SUBSET_ASCENDABLE_SCHEMA,
272
+ self.DB_SUBSET_DISTINCT_SCHEMA,
273
+ self.DB_SUBSET_ASCENDABLE_SCHEMA,
274
+ self.DO_SUBSET_ASCENDABLE_SCHEMA if support_dropout else {},
275
+ )
276
+
277
+ self.ft_subset_schema = self.__merge_dict(
278
+ self.SUBSET_ASCENDABLE_SCHEMA,
279
+ self.FT_SUBSET_DISTINCT_SCHEMA,
280
+ self.DO_SUBSET_ASCENDABLE_SCHEMA if support_dropout else {},
281
+ )
282
+
283
+ self.cn_subset_schema = self.__merge_dict(
284
+ self.SUBSET_ASCENDABLE_SCHEMA,
285
+ self.CN_SUBSET_DISTINCT_SCHEMA,
286
+ self.CN_SUBSET_ASCENDABLE_SCHEMA,
287
+ self.DO_SUBSET_ASCENDABLE_SCHEMA if support_dropout else {},
288
+ )
289
+
290
+ self.db_dataset_schema = self.__merge_dict(
291
+ self.DATASET_ASCENDABLE_SCHEMA,
292
+ self.SUBSET_ASCENDABLE_SCHEMA,
293
+ self.DB_SUBSET_ASCENDABLE_SCHEMA,
294
+ self.DO_SUBSET_ASCENDABLE_SCHEMA if support_dropout else {},
295
+ {"subsets": [self.db_subset_schema]},
296
+ )
297
+
298
+ self.ft_dataset_schema = self.__merge_dict(
299
+ self.DATASET_ASCENDABLE_SCHEMA,
300
+ self.SUBSET_ASCENDABLE_SCHEMA,
301
+ self.DO_SUBSET_ASCENDABLE_SCHEMA if support_dropout else {},
302
+ {"subsets": [self.ft_subset_schema]},
303
+ )
304
+
305
+ self.cn_dataset_schema = self.__merge_dict(
306
+ self.DATASET_ASCENDABLE_SCHEMA,
307
+ self.SUBSET_ASCENDABLE_SCHEMA,
308
+ self.CN_SUBSET_ASCENDABLE_SCHEMA,
309
+ self.DO_SUBSET_ASCENDABLE_SCHEMA if support_dropout else {},
310
+ {"subsets": [self.cn_subset_schema]},
311
+ )
312
+
313
+ if support_dreambooth and support_finetuning:
314
+
315
+ def validate_flex_dataset(dataset_config: dict):
316
+ subsets_config = dataset_config.get("subsets", [])
317
+
318
+ if support_controlnet and all(["conditioning_data_dir" in subset for subset in subsets_config]):
319
+ return Schema(self.cn_dataset_schema)(dataset_config)
320
+ # check dataset meets FT style
321
+ # NOTE: all FT subsets should have "metadata_file"
322
+ elif all(["metadata_file" in subset for subset in subsets_config]):
323
+ return Schema(self.ft_dataset_schema)(dataset_config)
324
+ # check dataset meets DB style
325
+ # NOTE: all DB subsets should have no "metadata_file"
326
+ elif all(["metadata_file" not in subset for subset in subsets_config]):
327
+ return Schema(self.db_dataset_schema)(dataset_config)
328
+ else:
329
+ raise voluptuous.Invalid(
330
+ "DreamBooth subset and fine tuning subset cannot be mixed in the same dataset. Please split them into separate datasets. / DreamBoothのサブセットとfine tuninのサブセットを同一のデータセットに混在させることはできません。別々のデータセットに分割してください。"
331
+ )
332
+
333
+ self.dataset_schema = validate_flex_dataset
334
+ elif support_dreambooth:
335
+ if support_controlnet:
336
+ self.dataset_schema = self.cn_dataset_schema
337
+ else:
338
+ self.dataset_schema = self.db_dataset_schema
339
+ elif support_finetuning:
340
+ self.dataset_schema = self.ft_dataset_schema
341
+ elif support_controlnet:
342
+ self.dataset_schema = self.cn_dataset_schema
343
+
344
+ self.general_schema = self.__merge_dict(
345
+ self.DATASET_ASCENDABLE_SCHEMA,
346
+ self.SUBSET_ASCENDABLE_SCHEMA,
347
+ self.DB_SUBSET_ASCENDABLE_SCHEMA if support_dreambooth else {},
348
+ self.CN_SUBSET_ASCENDABLE_SCHEMA if support_controlnet else {},
349
+ self.DO_SUBSET_ASCENDABLE_SCHEMA if support_dropout else {},
350
+ )
351
+
352
+ self.user_config_validator = Schema(
353
+ {
354
+ "general": self.general_schema,
355
+ "datasets": [self.dataset_schema],
356
+ }
357
+ )
358
+
359
+ self.argparse_schema = self.__merge_dict(
360
+ self.general_schema,
361
+ self.ARGPARSE_SPECIFIC_SCHEMA,
362
+ {optname: Any(None, self.general_schema[optname]) for optname in self.ARGPARSE_NULLABLE_OPTNAMES},
363
+ {a_name: self.general_schema[c_name] for a_name, c_name in self.ARGPARSE_OPTNAME_TO_CONFIG_OPTNAME.items()},
364
+ )
365
+
366
+ self.argparse_config_validator = Schema(Object(self.argparse_schema), extra=voluptuous.ALLOW_EXTRA)
367
+
368
+ def sanitize_user_config(self, user_config: dict) -> dict:
369
+ try:
370
+ return self.user_config_validator(user_config)
371
+ except MultipleInvalid:
372
+ # TODO: エラー発生時のメッセージをわかりやすくする
373
+ logger.error("Invalid user config / ユーザ設定の形式が正しくないようです")
374
+ raise
375
+
376
+ # NOTE: In nature, argument parser result is not needed to be sanitize
377
+ # However this will help us to detect program bug
378
+ def sanitize_argparse_namespace(self, argparse_namespace: argparse.Namespace) -> argparse.Namespace:
379
+ try:
380
+ return self.argparse_config_validator(argparse_namespace)
381
+ except MultipleInvalid:
382
+ # XXX: this should be a bug
383
+ logger.error(
384
+ "Invalid cmdline parsed arguments. This should be a bug. / コマンドラインのパース結果が正しくないようです。プログラムのバグの可能性が高いです。"
385
+ )
386
+ raise
387
+
388
+ # NOTE: value would be overwritten by latter dict if there is already the same key
389
+ @staticmethod
390
+ def __merge_dict(*dict_list: dict) -> dict:
391
+ merged = {}
392
+ for schema in dict_list:
393
+ # merged |= schema
394
+ for k, v in schema.items():
395
+ merged[k] = v
396
+ return merged
397
+
398
+
399
+ class BlueprintGenerator:
400
+ BLUEPRINT_PARAM_NAME_TO_CONFIG_OPTNAME = {}
401
+
402
+ def __init__(self, sanitizer: ConfigSanitizer):
403
+ self.sanitizer = sanitizer
404
+
405
+ # runtime_params is for parameters which is only configurable on runtime, such as tokenizer
406
+ def generate(self, user_config: dict, argparse_namespace: argparse.Namespace, **runtime_params) -> Blueprint:
407
+ sanitized_user_config = self.sanitizer.sanitize_user_config(user_config)
408
+ sanitized_argparse_namespace = self.sanitizer.sanitize_argparse_namespace(argparse_namespace)
409
+
410
+ # convert argparse namespace to dict like config
411
+ # NOTE: it is ok to have extra entries in dict
412
+ optname_map = self.sanitizer.ARGPARSE_OPTNAME_TO_CONFIG_OPTNAME
413
+ argparse_config = {
414
+ optname_map.get(optname, optname): value for optname, value in vars(sanitized_argparse_namespace).items()
415
+ }
416
+
417
+ general_config = sanitized_user_config.get("general", {})
418
+
419
+ dataset_blueprints = []
420
+ for dataset_config in sanitized_user_config.get("datasets", []):
421
+ # NOTE: if subsets have no "metadata_file", these are DreamBooth datasets/subsets
422
+ subsets = dataset_config.get("subsets", [])
423
+ is_dreambooth = all(["metadata_file" not in subset for subset in subsets])
424
+ is_controlnet = all(["conditioning_data_dir" in subset for subset in subsets])
425
+ if is_controlnet:
426
+ subset_params_klass = ControlNetSubsetParams
427
+ dataset_params_klass = ControlNetDatasetParams
428
+ elif is_dreambooth:
429
+ subset_params_klass = DreamBoothSubsetParams
430
+ dataset_params_klass = DreamBoothDatasetParams
431
+ else:
432
+ subset_params_klass = FineTuningSubsetParams
433
+ dataset_params_klass = FineTuningDatasetParams
434
+
435
+ subset_blueprints = []
436
+ for subset_config in subsets:
437
+ params = self.generate_params_by_fallbacks(
438
+ subset_params_klass, [subset_config, dataset_config, general_config, argparse_config, runtime_params]
439
+ )
440
+ subset_blueprints.append(SubsetBlueprint(params))
441
+
442
+ params = self.generate_params_by_fallbacks(
443
+ dataset_params_klass, [dataset_config, general_config, argparse_config, runtime_params]
444
+ )
445
+ dataset_blueprints.append(DatasetBlueprint(is_dreambooth, is_controlnet, params, subset_blueprints))
446
+
447
+ dataset_group_blueprint = DatasetGroupBlueprint(dataset_blueprints)
448
+
449
+ return Blueprint(dataset_group_blueprint)
450
+
451
+ @staticmethod
452
+ def generate_params_by_fallbacks(param_klass, fallbacks: Sequence[dict]):
453
+ name_map = BlueprintGenerator.BLUEPRINT_PARAM_NAME_TO_CONFIG_OPTNAME
454
+ search_value = BlueprintGenerator.search_value
455
+ default_params = asdict(param_klass())
456
+ param_names = default_params.keys()
457
+
458
+ params = {name: search_value(name_map.get(name, name), fallbacks, default_params.get(name)) for name in param_names}
459
+
460
+ return param_klass(**params)
461
+
462
+ @staticmethod
463
+ def search_value(key: str, fallbacks: Sequence[dict], default_value=None):
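+ # returns the first non-None value for `key`, scanning the fallback dicts in the
+ # given order (most specific first); returns default_value if none of them sets it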
464
+ for cand in fallbacks:
465
+ value = cand.get(key)
466
+ if value is not None:
467
+ return value
468
+
469
+ return default_value
470
+
471
+
472
+ def generate_dataset_group_by_blueprint(dataset_group_blueprint: DatasetGroupBlueprint):
473
+ datasets: List[Union[DreamBoothDataset, FineTuningDataset, ControlNetDataset]] = []
474
+
475
+ for dataset_blueprint in dataset_group_blueprint.datasets:
476
+ if dataset_blueprint.is_controlnet:
477
+ subset_klass = ControlNetSubset
478
+ dataset_klass = ControlNetDataset
479
+ elif dataset_blueprint.is_dreambooth:
480
+ subset_klass = DreamBoothSubset
481
+ dataset_klass = DreamBoothDataset
482
+ else:
483
+ subset_klass = FineTuningSubset
484
+ dataset_klass = FineTuningDataset
485
+
486
+ subsets = [subset_klass(**asdict(subset_blueprint.params)) for subset_blueprint in dataset_blueprint.subsets]
487
+ dataset = dataset_klass(subsets=subsets, **asdict(dataset_blueprint.params))
488
+ datasets.append(dataset)
489
+
490
+ # print info
491
+ info = ""
492
+ for i, dataset in enumerate(datasets):
493
+ is_dreambooth = isinstance(dataset, DreamBoothDataset)
494
+ is_controlnet = isinstance(dataset, ControlNetDataset)
495
+ info += dedent(
496
+ f"""\
497
+ [Dataset {i}]
498
+ batch_size: {dataset.batch_size}
499
+ resolution: {(dataset.width, dataset.height)}
500
+ enable_bucket: {dataset.enable_bucket}
501
+ network_multiplier: {dataset.network_multiplier}
502
+ """
503
+ )
504
+
505
+ if dataset.enable_bucket:
506
+ info += indent(
507
+ dedent(
508
+ f"""\
509
+ min_bucket_reso: {dataset.min_bucket_reso}
510
+ max_bucket_reso: {dataset.max_bucket_reso}
511
+ bucket_reso_steps: {dataset.bucket_reso_steps}
512
+ bucket_no_upscale: {dataset.bucket_no_upscale}
513
+ \n"""
514
+ ),
515
+ " ",
516
+ )
517
+ else:
518
+ info += "\n"
519
+
520
+ for j, subset in enumerate(dataset.subsets):
521
+ info += indent(
522
+ dedent(
523
+ f"""\
524
+ [Subset {j} of Dataset {i}]
525
+ image_dir: "{subset.image_dir}"
526
+ image_count: {subset.img_count}
527
+ num_repeats: {subset.num_repeats}
528
+ shuffle_caption: {subset.shuffle_caption}
529
+ keep_tokens: {subset.keep_tokens}
530
+ keep_tokens_separator: {subset.keep_tokens_separator}
531
+ caption_separator: {subset.caption_separator}
532
+ secondary_separator: {subset.secondary_separator}
533
+ enable_wildcard: {subset.enable_wildcard}
534
+ caption_dropout_rate: {subset.caption_dropout_rate}
535
+ caption_dropout_every_n_epoches: {subset.caption_dropout_every_n_epochs}
536
+ caption_tag_dropout_rate: {subset.caption_tag_dropout_rate}
537
+ caption_prefix: {subset.caption_prefix}
538
+ caption_suffix: {subset.caption_suffix}
539
+ color_aug: {subset.color_aug}
540
+ flip_aug: {subset.flip_aug}
541
+ face_crop_aug_range: {subset.face_crop_aug_range}
542
+ random_crop: {subset.random_crop}
543
+ token_warmup_min: {subset.token_warmup_min},
544
+ token_warmup_step: {subset.token_warmup_step},
545
+ alpha_mask: {subset.alpha_mask},
546
+ """
547
+ ),
548
+ " ",
549
+ )
550
+
551
+ if is_dreambooth:
552
+ info += indent(
553
+ dedent(
554
+ f"""\
555
+ is_reg: {subset.is_reg}
556
+ class_tokens: {subset.class_tokens}
557
+ caption_extension: {subset.caption_extension}
558
+ \n"""
559
+ ),
560
+ " ",
561
+ )
562
+ elif not is_controlnet:
563
+ info += indent(
564
+ dedent(
565
+ f"""\
566
+ metadata_file: {subset.metadata_file}
567
+ \n"""
568
+ ),
569
+ " ",
570
+ )
571
+
572
+ logger.info(f"{info}")
573
+
574
+ # make buckets first because it determines the length of dataset
575
+ # and set the same seed for all datasets
576
+ seed = random.randint(0, 2**31) # actual seed is seed + epoch_no
577
+ for i, dataset in enumerate(datasets):
578
+ logger.info(f"[Dataset {i}]")
579
+ dataset.make_buckets()
580
+ dataset.set_seed(seed)
581
+
582
+ return DatasetGroup(datasets)
583
+
584
+
585
+ def generate_dreambooth_subsets_config_by_subdirs(train_data_dir: Optional[str] = None, reg_data_dir: Optional[str] = None):
586
+ def extract_dreambooth_params(name: str) -> Tuple[int, str]:
587
+ tokens = name.split("_")
588
+ try:
589
+ n_repeats = int(tokens[0])
590
+ except ValueError as e:
591
+ logger.warning(f"ignore directory without repeats / 繰り返し回数のないディレクトリを無視します: {name}")
592
+ return 0, ""
593
+ caption_by_folder = "_".join(tokens[1:])
594
+ return n_repeats, caption_by_folder
595
+
596
+ def generate(base_dir: Optional[str], is_reg: bool):
597
+ if base_dir is None:
598
+ return []
599
+
600
+ base_dir: Path = Path(base_dir)
601
+ if not base_dir.is_dir():
602
+ return []
603
+
604
+ subsets_config = []
605
+ for subdir in base_dir.iterdir():
606
+ if not subdir.is_dir():
607
+ continue
608
+
609
+ num_repeats, class_tokens = extract_dreambooth_params(subdir.name)
610
+ if num_repeats < 1:
611
+ continue
612
+
613
+ subset_config = {"image_dir": str(subdir), "num_repeats": num_repeats, "is_reg": is_reg, "class_tokens": class_tokens}
614
+ subsets_config.append(subset_config)
615
+
616
+ return subsets_config
617
+
618
+ subsets_config = []
619
+ subsets_config += generate(train_data_dir, False)
620
+ subsets_config += generate(reg_data_dir, True)
621
+
622
+ return subsets_config
623
+
624
+
625
+ def generate_controlnet_subsets_config_by_subdirs(
626
+ train_data_dir: Optional[str] = None, conditioning_data_dir: Optional[str] = None, caption_extension: str = ".txt"
627
+ ):
628
+ def generate(base_dir: Optional[str]):
629
+ if base_dir is None:
630
+ return []
631
+
632
+ base_dir: Path = Path(base_dir)
633
+ if not base_dir.is_dir():
634
+ return []
635
+
636
+ subsets_config = []
637
+ subset_config = {
638
+ "image_dir": train_data_dir,
639
+ "conditioning_data_dir": conditioning_data_dir,
640
+ "caption_extension": caption_extension,
641
+ "num_repeats": 1,
642
+ }
643
+ subsets_config.append(subset_config)
644
+
645
+ return subsets_config
646
+
647
+ subsets_config = []
648
+ subsets_config += generate(train_data_dir)
649
+
650
+ return subsets_config
651
+
652
+
653
+ def load_user_config(file: str) -> dict:
654
+ file: Path = Path(file)
655
+ if not file.is_file():
656
+ raise ValueError(f"file not found / ファイルが見つかりません: {file}")
657
+
658
+ if file.name.lower().endswith(".json"):
659
+ try:
660
+ with open(file, "r") as f:
661
+ config = json.load(f)
662
+ except Exception:
663
+ logger.error(
664
+ f"Error on parsing JSON config file. Please check the format. / JSON 形式の設定ファイルの読み込みに失敗しました。文法が正しいか確認してください。: {file}"
665
+ )
666
+ raise
667
+ elif file.name.lower().endswith(".toml"):
668
+ try:
669
+ config = toml.load(file)
670
+ except Exception:
671
+ logger.error(
672
+ f"Error on parsing TOML config file. Please check the format. / TOML 形式の設定ファイルの読み込みに失敗しました。文法が正しいか確認してください。: {file}"
673
+ )
674
+ raise
675
+ else:
676
+ raise ValueError(f"not supported config file format / 対応していない設定ファイルの形式です: {file}")
677
+
678
+ return config
679
+
680
+
681
+ # for config test
682
+ if __name__ == "__main__":
683
+ parser = argparse.ArgumentParser()
684
+ parser.add_argument("--support_dreambooth", action="store_true")
685
+ parser.add_argument("--support_finetuning", action="store_true")
686
+ parser.add_argument("--support_controlnet", action="store_true")
687
+ parser.add_argument("--support_dropout", action="store_true")
688
+ parser.add_argument("dataset_config")
689
+ config_args, remain = parser.parse_known_args()
690
+
691
+ parser = argparse.ArgumentParser()
692
+ train_util.add_dataset_arguments(
693
+ parser, config_args.support_dreambooth, config_args.support_finetuning, config_args.support_dropout
694
+ )
695
+ train_util.add_training_arguments(parser, config_args.support_dreambooth)
696
+ argparse_namespace = parser.parse_args(remain)
697
+ train_util.prepare_dataset_args(argparse_namespace, config_args.support_finetuning)
698
+
699
+ logger.info("[argparse_namespace]")
700
+ logger.info(f"{vars(argparse_namespace)}")
701
+
702
+ user_config = load_user_config(config_args.dataset_config)
703
+
704
+ logger.info("")
705
+ logger.info("[user_config]")
706
+ logger.info(f"{user_config}")
707
+
708
+ sanitizer = ConfigSanitizer(
709
+ config_args.support_dreambooth, config_args.support_finetuning, config_args.support_controlnet, config_args.support_dropout
710
+ )
711
+ sanitized_user_config = sanitizer.sanitize_user_config(user_config)
712
+
713
+ logger.info("")
714
+ logger.info("[sanitized_user_config]")
715
+ logger.info(f"{sanitized_user_config}")
716
+
717
+ blueprint = BlueprintGenerator(sanitizer).generate(user_config, argparse_namespace)
718
+
719
+ logger.info("")
720
+ logger.info("[blueprint]")
721
+ logger.info(f"{blueprint}")
control_net_lllite.py ADDED
@@ -0,0 +1,449 @@
1
+ import os
2
+ from typing import Optional, List, Type
3
+ import torch
4
+ from library import sdxl_original_unet
5
+ from library.utils import setup_logging
6
+ setup_logging()
7
+ import logging
8
+ logger = logging.getLogger(__name__)
9
+
10
+ # input_blocksに適用するかどうか / if True, input_blocks are not applied
11
+ SKIP_INPUT_BLOCKS = False
12
+
13
+ # output_blocksに適用するかどうか / if True, output_blocks are not applied
14
+ SKIP_OUTPUT_BLOCKS = True
15
+
16
+ # conv2dに適用するかどうか / if True, conv2d are not applied
17
+ SKIP_CONV2D = False
18
+
19
+ # transformer_blocksのみに適用するかどうか。Trueの場合、ResBlockには適用されない
20
+ # if True, only transformer_blocks are applied, and ResBlocks are not applied
21
+ TRANSFORMER_ONLY = True # if True, SKIP_CONV2D is ignored because conv2d is not used in transformer_blocks
22
+
23
+ # Trueならattn1とattn2にのみ適用し、ffなどには適用しない / if True, apply only to attn1 and attn2, not to ff etc.
24
+ ATTN1_2_ONLY = True
25
+
26
+ # Trueならattn1のQKV、attn2のQにのみ適用する、ATTN1_2_ONLY指定時のみ有効 / if True, apply only to attn1 QKV and attn2 Q, only valid when ATTN1_2_ONLY is specified
27
+ ATTN_QKV_ONLY = True
28
+
29
+ # Trueならattn1やffなどにのみ適用し、attn2などには適用しない / if True, apply only to attn1 and ff, not to attn2
30
+ # ATTN1_2_ONLYと同時にTrueにできない / cannot be True at the same time as ATTN1_2_ONLY
31
+ ATTN1_ETC_ONLY = False # True
32
+
33
+ # transformer_blocksの最大インデックス。Noneなら全てのtransformer_blocksに適用
34
+ # max index of transformer_blocks. if None, apply to all transformer_blocks
35
+ TRANSFORMER_MAX_BLOCK_INDEX = None
36
+
37
+
38
+ class LLLiteModule(torch.nn.Module):
39
+ def __init__(self, depth, cond_emb_dim, name, org_module, mlp_dim, dropout=None, multiplier=1.0):
40
+ super().__init__()
41
+
42
+ self.is_conv2d = org_module.__class__.__name__ == "Conv2d"
43
+ self.lllite_name = name
44
+ self.cond_emb_dim = cond_emb_dim
45
+ self.org_module = [org_module]
46
+ self.dropout = dropout
47
+ self.multiplier = multiplier
48
+
49
+ if self.is_conv2d:
50
+ in_dim = org_module.in_channels
51
+ else:
52
+ in_dim = org_module.in_features
53
+
54
+ # conditioning1はconditioning imageを embedding する。timestepごとに呼ばれない
55
+ # conditioning1 embeds conditioning image. it is not called for each timestep
56
+ modules = []
57
+ modules.append(torch.nn.Conv2d(3, cond_emb_dim // 2, kernel_size=4, stride=4, padding=0)) # to latent (from VAE) size
58
+ if depth == 1:
59
+ modules.append(torch.nn.ReLU(inplace=True))
60
+ modules.append(torch.nn.Conv2d(cond_emb_dim // 2, cond_emb_dim, kernel_size=2, stride=2, padding=0))
61
+ elif depth == 2:
62
+ modules.append(torch.nn.ReLU(inplace=True))
63
+ modules.append(torch.nn.Conv2d(cond_emb_dim // 2, cond_emb_dim, kernel_size=4, stride=4, padding=0))
64
+ elif depth == 3:
65
+ # kernel size 8は大きすぎるので、4にする / kernel size 8 is too large, so set it to 4
66
+ modules.append(torch.nn.ReLU(inplace=True))
67
+ modules.append(torch.nn.Conv2d(cond_emb_dim // 2, cond_emb_dim // 2, kernel_size=4, stride=4, padding=0))
68
+ modules.append(torch.nn.ReLU(inplace=True))
69
+ modules.append(torch.nn.Conv2d(cond_emb_dim // 2, cond_emb_dim, kernel_size=2, stride=2, padding=0))
70
+
71
+ self.conditioning1 = torch.nn.Sequential(*modules)
72
+
73
+ # downで入力の次元数を削減する。LoRAにヒントを得ていることにする
74
+ # midでconditioning image embeddingと入力を結合する
75
+ # upで元の次元数に戻す
76
+ # これらはtimestepごとに呼ばれる
77
+ # reduce the number of input dimensions with down. inspired by LoRA
78
+ # combine conditioning image embedding and input with mid
79
+ # restore to the original dimension with up
80
+ # these are called for each timestep
81
+
82
+ if self.is_conv2d:
83
+ self.down = torch.nn.Sequential(
84
+ torch.nn.Conv2d(in_dim, mlp_dim, kernel_size=1, stride=1, padding=0),
85
+ torch.nn.ReLU(inplace=True),
86
+ )
87
+ self.mid = torch.nn.Sequential(
88
+ torch.nn.Conv2d(mlp_dim + cond_emb_dim, mlp_dim, kernel_size=1, stride=1, padding=0),
89
+ torch.nn.ReLU(inplace=True),
90
+ )
91
+ self.up = torch.nn.Sequential(
92
+ torch.nn.Conv2d(mlp_dim, in_dim, kernel_size=1, stride=1, padding=0),
93
+ )
94
+ else:
95
+ # midの前にconditioningをreshapeすること / reshape conditioning before mid
96
+ self.down = torch.nn.Sequential(
97
+ torch.nn.Linear(in_dim, mlp_dim),
98
+ torch.nn.ReLU(inplace=True),
99
+ )
100
+ self.mid = torch.nn.Sequential(
101
+ torch.nn.Linear(mlp_dim + cond_emb_dim, mlp_dim),
102
+ torch.nn.ReLU(inplace=True),
103
+ )
104
+ self.up = torch.nn.Sequential(
105
+ torch.nn.Linear(mlp_dim, in_dim),
106
+ )
107
+
108
+ # Zero-Convにする / set to Zero-Conv
109
+ torch.nn.init.zeros_(self.up[0].weight) # zero conv
110
+
111
+ self.depth = depth # 1~3
112
+ self.cond_emb = None
113
+ self.batch_cond_only = False # Trueなら推論時のcondにのみ適用する / if True, apply only to cond at inference
114
+ self.use_zeros_for_batch_uncond = False # Trueならuncondのconditioningを0にする / if True, set uncond conditioning to 0
115
+
116
+ # batch_cond_onlyとuse_zeros_for_batch_uncondはどちらも適用すると生成画像の色味がおかしくなるので実際には使えそうにない
117
+ # Controlの種類によっては使えるかも
118
+ # both batch_cond_only and use_zeros_for_batch_uncond make the color of the generated image strange, so it doesn't seem to be usable in practice
119
+ # it may be available depending on the type of Control
120
+
121
+ def set_cond_image(self, cond_image):
122
+ r"""
123
+ 中でモデルを呼び出すので必要ならwith torch.no_grad()で囲む
124
+ / call the model inside, so if necessary, surround it with torch.no_grad()
125
+ """
126
+ if cond_image is None:
127
+ self.cond_emb = None
128
+ return
129
+
130
+ # timestepごとに呼ばれないので、あらかじめ計算しておく / it is not called for each timestep, so calculate it in advance
131
+ # logger.info(f"C {self.lllite_name}, cond_image.shape={cond_image.shape}")
132
+ cx = self.conditioning1(cond_image)
133
+ if not self.is_conv2d:
134
+ # reshape / b,c,h,w -> b,h*w,c
135
+ n, c, h, w = cx.shape
136
+ cx = cx.view(n, c, h * w).permute(0, 2, 1)
137
+ self.cond_emb = cx
138
+
139
+ def set_batch_cond_only(self, cond_only, zeros):
140
+ self.batch_cond_only = cond_only
141
+ self.use_zeros_for_batch_uncond = zeros
142
+
143
+ def apply_to(self):
144
+ self.org_forward = self.org_module[0].forward
145
+ self.org_module[0].forward = self.forward
146
+
147
+ def forward(self, x):
148
+ r"""
149
+ 学習用の便利forward。元のモジュールのforwardを呼び出す
150
+ / convenient forward for training. call the forward of the original module
151
+ """
152
+ if self.multiplier == 0.0 or self.cond_emb is None:
153
+ return self.org_forward(x)
154
+
155
+ cx = self.cond_emb
156
+
157
+ if not self.batch_cond_only and x.shape[0] // 2 == cx.shape[0]: # inference only
158
+ cx = cx.repeat(2, 1, 1, 1) if self.is_conv2d else cx.repeat(2, 1, 1)
159
+ if self.use_zeros_for_batch_uncond:
160
+ cx[0::2] = 0.0 # uncond is zero
161
+ # logger.info(f"C {self.lllite_name}, x.shape={x.shape}, cx.shape={cx.shape}")
162
+
163
+ # downで入力の次元数を削減し、conditioning image embeddingと結合する
164
+ # 加算ではなくchannel方向に結合することで、うまいこと混ぜてくれることを期待している
165
+ # down reduces the number of input dimensions and combines it with conditioning image embedding
166
+ # we expect that it will mix well by combining in the channel direction instead of adding
167
+
168
+ cx = torch.cat([cx, self.down(x if not self.batch_cond_only else x[1::2])], dim=1 if self.is_conv2d else 2)
169
+ cx = self.mid(cx)
170
+
171
+ if self.dropout is not None and self.training:
172
+ cx = torch.nn.functional.dropout(cx, p=self.dropout)
173
+
174
+ cx = self.up(cx) * self.multiplier
175
+
176
+ # residual (x) を加算して元のforwardを呼び出す / add residual (x) and call the original forward
177
+ if self.batch_cond_only:
178
+ zx = torch.zeros_like(x)
179
+ zx[1::2] += cx
180
+ cx = zx
181
+
182
+ x = self.org_forward(x + cx) # ここで元のモジュールを呼び出す / call the original module here
183
+ return x
184
+
185
+
186
+ class ControlNetLLLite(torch.nn.Module):
187
+ UNET_TARGET_REPLACE_MODULE = ["Transformer2DModel"]
188
+ UNET_TARGET_REPLACE_MODULE_CONV2D_3X3 = ["ResnetBlock2D", "Downsample2D", "Upsample2D"]
189
+
190
+ def __init__(
191
+ self,
192
+ unet: sdxl_original_unet.SdxlUNet2DConditionModel,
193
+ cond_emb_dim: int = 16,
194
+ mlp_dim: int = 16,
195
+ dropout: Optional[float] = None,
196
+ varbose: Optional[bool] = False,
197
+ multiplier: Optional[float] = 1.0,
198
+ ) -> None:
199
+ super().__init__()
200
+ # self.unets = [unet]
201
+
202
+ def create_modules(
203
+ root_module: torch.nn.Module,
204
+ target_replace_modules: List[torch.nn.Module],
205
+ module_class: Type[object],
206
+ ) -> List[torch.nn.Module]:
207
+ prefix = "lllite_unet"
208
+
209
+ modules = []
210
+ for name, module in root_module.named_modules():
211
+ if module.__class__.__name__ in target_replace_modules:
212
+ for child_name, child_module in module.named_modules():
213
+ is_linear = child_module.__class__.__name__ == "Linear"
214
+ is_conv2d = child_module.__class__.__name__ == "Conv2d"
215
+
216
+ if is_linear or (is_conv2d and not SKIP_CONV2D):
217
+ # block indexからdepthを計算: depthはconditioningのサイズやチャネルを計算するのに使う
218
+ # block index to depth: depth is used to calculate conditioning size and channels
219
+ block_name, index1, index2 = (name + "." + child_name).split(".")[:3]
220
+ index1 = int(index1)
221
+ if block_name == "input_blocks":
222
+ if SKIP_INPUT_BLOCKS:
223
+ continue
224
+ depth = 1 if index1 <= 2 else (2 if index1 <= 5 else 3)
225
+ elif block_name == "middle_block":
226
+ depth = 3
227
+ elif block_name == "output_blocks":
228
+ if SKIP_OUTPUT_BLOCKS:
229
+ continue
230
+ depth = 3 if index1 <= 2 else (2 if index1 <= 5 else 1)
231
+ if int(index2) >= 2:
232
+ depth -= 1
233
+ else:
234
+ raise NotImplementedError()
235
+
236
+ lllite_name = prefix + "." + name + "." + child_name
237
+ lllite_name = lllite_name.replace(".", "_")
238
+
239
+ if TRANSFORMER_MAX_BLOCK_INDEX is not None:
240
+ p = lllite_name.find("transformer_blocks")
241
+ if p >= 0:
242
+ tf_index = int(lllite_name[p:].split("_")[2])
243
+ if tf_index > TRANSFORMER_MAX_BLOCK_INDEX:
244
+ continue
245
+
246
+ # time embは適用外とする
247
+ # attn2のconditioning (CLIPからの入力) はshapeが違うので適用できない
248
+ # time emb is not applied
249
+ # attn2 conditioning (input from CLIP) cannot be applied because the shape is different
250
+ if "emb_layers" in lllite_name or (
251
+ "attn2" in lllite_name and ("to_k" in lllite_name or "to_v" in lllite_name)
252
+ ):
253
+ continue
254
+
255
+ if ATTN1_2_ONLY:
256
+ if not ("attn1" in lllite_name or "attn2" in lllite_name):
257
+ continue
258
+ if ATTN_QKV_ONLY:
259
+ if "to_out" in lllite_name:
260
+ continue
261
+
262
+ if ATTN1_ETC_ONLY:
263
+ if "proj_out" in lllite_name:
264
+ pass
265
+ elif "attn1" in lllite_name and (
266
+ "to_k" in lllite_name or "to_v" in lllite_name or "to_out" in lllite_name
267
+ ):
268
+ pass
269
+ elif "ff_net_2" in lllite_name:
270
+ pass
271
+ else:
272
+ continue
273
+
274
+ module = module_class(
275
+ depth,
276
+ cond_emb_dim,
277
+ lllite_name,
278
+ child_module,
279
+ mlp_dim,
280
+ dropout=dropout,
281
+ multiplier=multiplier,
282
+ )
283
+ modules.append(module)
284
+ return modules
285
+
286
+ target_modules = ControlNetLLLite.UNET_TARGET_REPLACE_MODULE
287
+ if not TRANSFORMER_ONLY:
288
+ target_modules = target_modules + ControlNetLLLite.UNET_TARGET_REPLACE_MODULE_CONV2D_3X3
289
+
290
+ # create module instances
291
+ self.unet_modules: List[LLLiteModule] = create_modules(unet, target_modules, LLLiteModule)
292
+ logger.info(f"create ControlNet LLLite for U-Net: {len(self.unet_modules)} modules.")
293
+
294
+ def forward(self, x):
295
+ return x # dummy
296
+
297
+ def set_cond_image(self, cond_image):
298
+ r"""
299
+ 中でモデルを呼び出すので必要ならwith torch.no_grad()で囲む
300
+ / call the model inside, so if necessary, surround it with torch.no_grad()
301
+ """
302
+ for module in self.unet_modules:
303
+ module.set_cond_image(cond_image)
304
+
305
+ def set_batch_cond_only(self, cond_only, zeros):
306
+ for module in self.unet_modules:
307
+ module.set_batch_cond_only(cond_only, zeros)
308
+
309
+ def set_multiplier(self, multiplier):
310
+ for module in self.unet_modules:
311
+ module.multiplier = multiplier
312
+
313
+ def load_weights(self, file):
314
+ if os.path.splitext(file)[1] == ".safetensors":
315
+ from safetensors.torch import load_file
316
+
317
+ weights_sd = load_file(file)
318
+ else:
319
+ weights_sd = torch.load(file, map_location="cpu")
320
+
321
+ info = self.load_state_dict(weights_sd, False)
322
+ return info
323
+
324
+ def apply_to(self):
325
+ logger.info("applying LLLite for U-Net...")
326
+ for module in self.unet_modules:
327
+ module.apply_to()
328
+ self.add_module(module.lllite_name, module)
329
+
330
+ # マージできるかどうかを返す / return whether the weights can be merged
331
+ def is_mergeable(self):
332
+ return False
333
+
334
+ def merge_to(self, text_encoder, unet, weights_sd, dtype, device):
335
+ raise NotImplementedError()
336
+
337
+ def enable_gradient_checkpointing(self):
338
+ # not supported
339
+ pass
340
+
341
+ def prepare_optimizer_params(self):
342
+ self.requires_grad_(True)
343
+ return self.parameters()
344
+
345
+ def prepare_grad_etc(self):
346
+ self.requires_grad_(True)
347
+
348
+ def on_epoch_start(self):
349
+ self.train()
350
+
351
+ def get_trainable_params(self):
352
+ return self.parameters()
353
+
354
+ def save_weights(self, file, dtype, metadata):
355
+ if metadata is not None and len(metadata) == 0:
356
+ metadata = None
357
+
358
+ state_dict = self.state_dict()
359
+
360
+ if dtype is not None:
361
+ for key in list(state_dict.keys()):
362
+ v = state_dict[key]
363
+ v = v.detach().clone().to("cpu").to(dtype)
364
+ state_dict[key] = v
365
+
366
+ if os.path.splitext(file)[1] == ".safetensors":
367
+ from safetensors.torch import save_file
368
+
369
+ save_file(state_dict, file, metadata)
370
+ else:
371
+ torch.save(state_dict, file)
372
+
373
+
374
+ if __name__ == "__main__":
375
+ # デバッグ用 / for debug
376
+
377
+ # sdxl_original_unet.USE_REENTRANT = False
378
+
379
+ # test shape etc
380
+ logger.info("create unet")
381
+ unet = sdxl_original_unet.SdxlUNet2DConditionModel()
382
+ unet.to("cuda").to(torch.float16)
383
+
384
+ logger.info("create ControlNet-LLLite")
385
+ control_net = ControlNetLLLite(unet, 32, 64)
386
+ control_net.apply_to()
387
+ control_net.to("cuda")
388
+
389
+ logger.info(control_net)
390
+
391
+ # logger.info number of parameters
392
+ logger.info(f"number of parameters {sum(p.numel() for p in control_net.parameters() if p.requires_grad)}")
393
+
394
+ input()
395
+
396
+ unet.set_use_memory_efficient_attention(True, False)
397
+ unet.set_gradient_checkpointing(True)
398
+ unet.train() # for gradient checkpointing
399
+
400
+ control_net.train()
401
+
402
+ # # visualize
403
+ # import torchviz
404
+ # logger.info("run visualize")
405
+ # controlnet.set_control(conditioning_image)
406
+ # output = unet(x, t, ctx, y)
407
+ # logger.info("make_dot")
408
+ # image = torchviz.make_dot(output, params=dict(controlnet.named_parameters()))
409
+ # logger.info("render")
410
+ # image.format = "svg" # "png"
411
+ # image.render("NeuralNet") # すごく時間がかかるので注意 / be careful because it takes a long time
412
+ # input()
413
+
414
+ import bitsandbytes
415
+
416
+ optimizer = bitsandbytes.adam.Adam8bit(control_net.prepare_optimizer_params(), 1e-3)
417
+
418
+ scaler = torch.cuda.amp.GradScaler(enabled=True)
419
+
420
+ logger.info("start training")
421
+ steps = 10
422
+
423
+ sample_param = [p for p in control_net.named_parameters() if "up" in p[0]][0]
424
+ for step in range(steps):
425
+ logger.info(f"step {step}")
426
+
427
+ batch_size = 1
428
+ conditioning_image = torch.rand(batch_size, 3, 1024, 1024).cuda() * 2.0 - 1.0
429
+ x = torch.randn(batch_size, 4, 128, 128).cuda()
430
+ t = torch.randint(low=0, high=10, size=(batch_size,)).cuda()
431
+ ctx = torch.randn(batch_size, 77, 2048).cuda()
432
+ y = torch.randn(batch_size, sdxl_original_unet.ADM_IN_CHANNELS).cuda()
433
+
434
+ with torch.cuda.amp.autocast(enabled=True):
435
+ control_net.set_cond_image(conditioning_image)
436
+
437
+ output = unet(x, t, ctx, y)
438
+ target = torch.randn_like(output)
439
+ loss = torch.nn.functional.mse_loss(output, target)
440
+
441
+ scaler.scale(loss).backward()
442
+ scaler.step(optimizer)
443
+ scaler.update()
444
+ optimizer.zero_grad(set_to_none=True)
445
+ logger.info(f"{sample_param}")
446
+
447
+ # from safetensors.torch import save_file
448
+
449
+ # save_file(control_net.state_dict(), "logs/control_net.safetensors")
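The __main__ block above is a training smoke test; at inference time the module is instead attached to an existing SDXL U-Net. A minimal sketch using only the methods defined in this file (the weights path and conditioning tensor are placeholders, and cond_emb_dim/mlp_dim must match the values the weights were trained with):

unet = sdxl_original_unet.SdxlUNet2DConditionModel()
control_net = ControlNetLLLite(unet, cond_emb_dim=16, mlp_dim=16)
control_net.apply_to()                                      # hook the targeted Linear/Conv2d forwards first
control_net.load_weights("controlnet_lllite.safetensors")   # hypothetical path; load after apply_to so keys match
control_net.to("cuda", dtype=torch.float16)

with torch.no_grad():
    control_net.set_cond_image(cond_image)  # e.g. (B, 3, 1024, 1024) scaled to [-1, 1], as in the debug block
# every subsequent unet(x, t, ctx, y) call now adds the LLLite residuals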
control_net_lllite_for_train.py ADDED
@@ -0,0 +1,501 @@
1
+ # cond_imageをU-Netのforwardで渡すバージョンのControlNet-LLLite検証用実装
2
+ # ControlNet-LLLite implementation for verification with cond_image passed in U-Net's forward
3
+
4
+ import os
5
+ import re
6
+ from typing import Optional, List, Type
7
+ import torch
8
+ from library import sdxl_original_unet
9
+ from library.utils import setup_logging
10
+
11
+ setup_logging()
12
+ import logging
13
+
14
+ logger = logging.getLogger(__name__)
15
+
16
+ # input_blocksに適用するかどうか / if True, input_blocks are not applied
17
+ SKIP_INPUT_BLOCKS = False
18
+
19
+ # output_blocksに適用するかどうか / if True, output_blocks are not applied
20
+ SKIP_OUTPUT_BLOCKS = True
21
+
22
+ # conv2dに適用するかどうか / if True, conv2d are not applied
23
+ SKIP_CONV2D = False
24
+
25
+ # transformer_blocksのみに適用するかどうか。Trueの場合、ResBlockには適用されない
26
+ # if True, only transformer_blocks are applied, and ResBlocks are not applied
27
+ TRANSFORMER_ONLY = True # if True, SKIP_CONV2D is ignored because conv2d is not used in transformer_blocks
28
+
29
+ # Trueならattn1とattn2にのみ適用し、ffなどには適用しない / if True, apply only to attn1 and attn2, not to ff etc.
30
+ ATTN1_2_ONLY = True
31
+
32
+ # Trueならattn1のQKV、attn2のQにのみ適用する、ATTN1_2_ONLY指定時のみ有効 / if True, apply only to attn1 QKV and attn2 Q, only valid when ATTN1_2_ONLY is specified
33
+ ATTN_QKV_ONLY = True
34
+
35
+ # Trueならattn1やffなどにのみ適用し、attn2などには適用しない / if True, apply only to attn1 and ff, not to attn2
36
+ # ATTN1_2_ONLYと同時にTrueにできない / cannot be True at the same time as ATTN1_2_ONLY
37
+ ATTN1_ETC_ONLY = False # True
38
+
39
+ # transformer_blocksの最大インデックス。Noneなら全てのtransformer_blocksに適用
40
+ # max index of transformer_blocks. if None, apply to all transformer_blocks
41
+ TRANSFORMER_MAX_BLOCK_INDEX = None
42
+
43
+ ORIGINAL_LINEAR = torch.nn.Linear
44
+ ORIGINAL_CONV2D = torch.nn.Conv2d
45
+
46
+
47
+ def add_lllite_modules(module: torch.nn.Module, in_dim: int, depth, cond_emb_dim, mlp_dim) -> None:
48
+ # conditioning1はconditioning imageを embedding する。timestepごとに呼ばれない
49
+ # conditioning1 embeds conditioning image. it is not called for each timestep
50
+ modules = []
51
+ modules.append(ORIGINAL_CONV2D(3, cond_emb_dim // 2, kernel_size=4, stride=4, padding=0)) # to latent (from VAE) size
52
+ if depth == 1:
53
+ modules.append(torch.nn.ReLU(inplace=True))
54
+ modules.append(ORIGINAL_CONV2D(cond_emb_dim // 2, cond_emb_dim, kernel_size=2, stride=2, padding=0))
55
+ elif depth == 2:
56
+ modules.append(torch.nn.ReLU(inplace=True))
57
+ modules.append(ORIGINAL_CONV2D(cond_emb_dim // 2, cond_emb_dim, kernel_size=4, stride=4, padding=0))
58
+ elif depth == 3:
59
+ # kernel size 8は大きすぎるので、4にする / kernel size 8 is too large, so set it to 4
60
+ modules.append(torch.nn.ReLU(inplace=True))
61
+ modules.append(ORIGINAL_CONV2D(cond_emb_dim // 2, cond_emb_dim // 2, kernel_size=4, stride=4, padding=0))
62
+ modules.append(torch.nn.ReLU(inplace=True))
63
+ modules.append(ORIGINAL_CONV2D(cond_emb_dim // 2, cond_emb_dim, kernel_size=2, stride=2, padding=0))
64
+
65
+ module.lllite_conditioning1 = torch.nn.Sequential(*modules)
66
+
67
+ # downで入力の次元数を削減する。LoRAにヒントを得ていることにする
68
+ # midでconditioning image embeddingと入力を結合する
69
+ # upで元の次元数に戻す
70
+ # これらはtimestepごとに呼ばれる
71
+ # reduce the number of input dimensions with down. inspired by LoRA
72
+ # combine conditioning image embedding and input with mid
73
+ # restore to the original dimension with up
74
+ # these are called for each timestep
75
+
76
+ module.lllite_down = torch.nn.Sequential(
77
+ ORIGINAL_LINEAR(in_dim, mlp_dim),
78
+ torch.nn.ReLU(inplace=True),
79
+ )
80
+ module.lllite_mid = torch.nn.Sequential(
81
+ ORIGINAL_LINEAR(mlp_dim + cond_emb_dim, mlp_dim),
82
+ torch.nn.ReLU(inplace=True),
83
+ )
84
+ module.lllite_up = torch.nn.Sequential(
85
+ ORIGINAL_LINEAR(mlp_dim, in_dim),
86
+ )
87
+
88
+ # Zero-Convにする / set to Zero-Conv
89
+ torch.nn.init.zeros_(module.lllite_up[0].weight) # zero conv
90
+
91
+
92
+ class LLLiteLinear(ORIGINAL_LINEAR):
93
+ def __init__(self, in_features: int, out_features: int, **kwargs):
94
+ super().__init__(in_features, out_features, **kwargs)
95
+ self.enabled = False
96
+
97
+ def set_lllite(self, depth, cond_emb_dim, name, mlp_dim, dropout=None, multiplier=1.0):
98
+ self.enabled = True
99
+ self.lllite_name = name
100
+ self.cond_emb_dim = cond_emb_dim
101
+ self.dropout = dropout
102
+ self.multiplier = multiplier # ignored
103
+
104
+ in_dim = self.in_features
105
+ add_lllite_modules(self, in_dim, depth, cond_emb_dim, mlp_dim)
106
+
107
+ self.cond_image = None
108
+
109
+ def set_cond_image(self, cond_image):
110
+ self.cond_image = cond_image
111
+
112
+ def forward(self, x):
113
+ if not self.enabled:
114
+ return super().forward(x)
115
+
116
+ cx = self.lllite_conditioning1(self.cond_image) # make forward and backward compatible
117
+
118
+ # reshape / b,c,h,w -> b,h*w,c
119
+ n, c, h, w = cx.shape
120
+ cx = cx.view(n, c, h * w).permute(0, 2, 1)
121
+
122
+ cx = torch.cat([cx, self.lllite_down(x)], dim=2)
123
+ cx = self.lllite_mid(cx)
124
+
125
+ if self.dropout is not None and self.training:
126
+ cx = torch.nn.functional.dropout(cx, p=self.dropout)
127
+
128
+ cx = self.lllite_up(cx) * self.multiplier
129
+
130
+ x = super().forward(x + cx) # ここで元のモジュールを呼び出す / call the original module here
131
+ return x
132
+
133
+
134
+ class LLLiteConv2d(ORIGINAL_CONV2D):
135
+ def __init__(self, in_channels: int, out_channels: int, kernel_size, **kwargs):
136
+ super().__init__(in_channels, out_channels, kernel_size, **kwargs)
137
+ self.enabled = False
138
+
139
+ def set_lllite(self, depth, cond_emb_dim, name, mlp_dim, dropout=None, multiplier=1.0):
140
+ self.enabled = True
141
+ self.lllite_name = name
142
+ self.cond_emb_dim = cond_emb_dim
143
+ self.dropout = dropout
144
+ self.multiplier = multiplier # ignored
145
+
146
+ in_dim = self.in_channels
147
+ add_lllite_modules(self, in_dim, depth, cond_emb_dim, mlp_dim)
148
+
149
+ self.cond_image = None
150
+ self.cond_emb = None
151
+
152
+ def set_cond_image(self, cond_image):
153
+ self.cond_image = cond_image
154
+ self.cond_emb = None
155
+
156
+ def forward(self, x): # , cond_image=None):
157
+ if not self.enabled:
158
+ return super().forward(x)
159
+
160
+ cx = self.lllite_conditioning1(self.cond_image)
161
+
162
+ cx = torch.cat([cx, self.lllite_down(x)], dim=1)
163
+ cx = self.lllite_mid(cx)
164
+
165
+ if self.dropout is not None and self.training:
166
+ cx = torch.nn.functional.dropout(cx, p=self.dropout)
167
+
168
+ cx = self.lllite_up(cx) * self.multiplier
169
+
170
+ x = super().forward(x + cx) # ここで元のモジュールを呼び出す / call the original module here
171
+ return x
172
+
173
+
174
+ class SdxlUNet2DConditionModelControlNetLLLite(sdxl_original_unet.SdxlUNet2DConditionModel):
175
+ UNET_TARGET_REPLACE_MODULE = ["Transformer2DModel"]
176
+ UNET_TARGET_REPLACE_MODULE_CONV2D_3X3 = ["ResnetBlock2D", "Downsample2D", "Upsample2D"]
177
+ LLLITE_PREFIX = "lllite_unet"
178
+
179
+ def __init__(self, **kwargs):
180
+ super().__init__(**kwargs)
181
+
182
+ def apply_lllite(
183
+ self,
184
+ cond_emb_dim: int = 16,
185
+ mlp_dim: int = 16,
186
+ dropout: Optional[float] = None,
187
+ varbose: Optional[bool] = False,
188
+ multiplier: Optional[float] = 1.0,
189
+ ) -> None:
190
+ def apply_to_modules(
191
+ root_module: torch.nn.Module,
192
+ target_replace_modules: List[torch.nn.Module],
193
+ ) -> List[torch.nn.Module]:
194
+ prefix = "lllite_unet"
195
+
196
+ modules = []
197
+ for name, module in root_module.named_modules():
198
+ if module.__class__.__name__ in target_replace_modules:
199
+ for child_name, child_module in module.named_modules():
200
+ is_linear = child_module.__class__.__name__ == "LLLiteLinear"
201
+ is_conv2d = child_module.__class__.__name__ == "LLLiteConv2d"
202
+
203
+ if is_linear or (is_conv2d and not SKIP_CONV2D):
204
+ # block indexからdepthを計算: depthはconditioningのサイズやチャネルを計算するのに使う
205
+ # block index to depth: depth is used to calculate conditioning size and channels
206
+ block_name, index1, index2 = (name + "." + child_name).split(".")[:3]
207
+ index1 = int(index1)
208
+ if block_name == "input_blocks":
209
+ if SKIP_INPUT_BLOCKS:
210
+ continue
211
+ depth = 1 if index1 <= 2 else (2 if index1 <= 5 else 3)
212
+ elif block_name == "middle_block":
213
+ depth = 3
214
+ elif block_name == "output_blocks":
215
+ if SKIP_OUTPUT_BLOCKS:
216
+ continue
217
+ depth = 3 if index1 <= 2 else (2 if index1 <= 5 else 1)
218
+ if int(index2) >= 2:
219
+ depth -= 1
220
+ else:
221
+ raise NotImplementedError()
222
+
223
+ lllite_name = prefix + "." + name + "." + child_name
224
+ lllite_name = lllite_name.replace(".", "_")
225
+
226
+ if TRANSFORMER_MAX_BLOCK_INDEX is not None:
227
+ p = lllite_name.find("transformer_blocks")
228
+ if p >= 0:
229
+ tf_index = int(lllite_name[p:].split("_")[2])
230
+ if tf_index > TRANSFORMER_MAX_BLOCK_INDEX:
231
+ continue
232
+
233
+ # time embは適用外とする
234
+ # attn2のconditioning (CLIPからの入力) はshapeが違うので適用できない
235
+ # time emb is not applied
236
+ # attn2 conditioning (input from CLIP) cannot be applied because the shape is different
237
+ if "emb_layers" in lllite_name or (
238
+ "attn2" in lllite_name and ("to_k" in lllite_name or "to_v" in lllite_name)
239
+ ):
240
+ continue
241
+
242
+ if ATTN1_2_ONLY:
243
+ if not ("attn1" in lllite_name or "attn2" in lllite_name):
244
+ continue
245
+ if ATTN_QKV_ONLY:
246
+ if "to_out" in lllite_name:
247
+ continue
248
+
249
+ if ATTN1_ETC_ONLY:
250
+ if "proj_out" in lllite_name:
251
+ pass
252
+ elif "attn1" in lllite_name and (
253
+ "to_k" in lllite_name or "to_v" in lllite_name or "to_out" in lllite_name
254
+ ):
255
+ pass
256
+ elif "ff_net_2" in lllite_name:
257
+ pass
258
+ else:
259
+ continue
260
+
261
+ child_module.set_lllite(depth, cond_emb_dim, lllite_name, mlp_dim, dropout, multiplier)
262
+ modules.append(child_module)
263
+
264
+ return modules
265
+
266
+ target_modules = SdxlUNet2DConditionModelControlNetLLLite.UNET_TARGET_REPLACE_MODULE
267
+ if not TRANSFORMER_ONLY:
268
+ target_modules = target_modules + SdxlUNet2DConditionModelControlNetLLLite.UNET_TARGET_REPLACE_MODULE_CONV2D_3X3
269
+
270
+ # create module instances
271
+ self.lllite_modules = apply_to_modules(self, target_modules)
272
+ logger.info(f"enable ControlNet LLLite for U-Net: {len(self.lllite_modules)} modules.")
273
+
274
+ # def prepare_optimizer_params(self):
275
+ def prepare_params(self):
276
+ train_params = []
277
+ non_train_params = []
278
+ for name, p in self.named_parameters():
279
+ if "lllite" in name:
280
+ train_params.append(p)
281
+ else:
282
+ non_train_params.append(p)
283
+ logger.info(f"count of trainable parameters: {len(train_params)}")
284
+ logger.info(f"count of non-trainable parameters: {len(non_train_params)}")
285
+
286
+ for p in non_train_params:
287
+ p.requires_grad_(False)
288
+
289
+ # without this, an error occurs in the optimizer
290
+ # RuntimeError: element 0 of tensors does not require grad and does not have a grad_fn
291
+ non_train_params[0].requires_grad_(True)
292
+
293
+ for p in train_params:
294
+ p.requires_grad_(True)
295
+
296
+ return train_params
297
+
298
+ # def prepare_grad_etc(self):
299
+ # self.requires_grad_(True)
300
+
301
+ # def on_epoch_start(self):
302
+ # self.train()
303
+
304
+ def get_trainable_params(self):
305
+ return [p[1] for p in self.named_parameters() if "lllite" in p[0]]
306
+
307
+ def save_lllite_weights(self, file, dtype, metadata):
308
+ if metadata is not None and len(metadata) == 0:
309
+ metadata = None
310
+
311
+ org_state_dict = self.state_dict()
312
+
313
+ # copy LLLite keys from org_state_dict to state_dict with key conversion
314
+ state_dict = {}
315
+ for key in org_state_dict.keys():
316
+ # split with ".lllite"
317
+ pos = key.find(".lllite")
318
+ if pos < 0:
319
+ continue
320
+ lllite_key = SdxlUNet2DConditionModelControlNetLLLite.LLLITE_PREFIX + "." + key[:pos]
321
+ lllite_key = lllite_key.replace(".", "_") + key[pos:]
322
+ lllite_key = lllite_key.replace(".lllite_", ".")
323
+ state_dict[lllite_key] = org_state_dict[key]
324
+
325
+ if dtype is not None:
326
+ for key in list(state_dict.keys()):
327
+ v = state_dict[key]
328
+ v = v.detach().clone().to("cpu").to(dtype)
329
+ state_dict[key] = v
330
+
331
+ if os.path.splitext(file)[1] == ".safetensors":
332
+ from safetensors.torch import save_file
333
+
334
+ save_file(state_dict, file, metadata)
335
+ else:
336
+ torch.save(state_dict, file)
337
+
338
+ def load_lllite_weights(self, file, non_lllite_unet_sd=None):
339
+ r"""
340
+ LLLiteの重みを読み込まない(initされた値を使う)場合はfileにNoneを指定する。
341
+ この場合、non_lllite_unet_sdにはU-Netのstate_dictを指定する。
342
+
343
+ If you do not want to load LLLite weights (use initialized values), specify None for file.
344
+ In this case, specify the state_dict of U-Net for non_lllite_unet_sd.
345
+ """
346
+ if not file:
347
+ state_dict = self.state_dict()
348
+ for key in non_lllite_unet_sd:
349
+ if key in state_dict:
350
+ state_dict[key] = non_lllite_unet_sd[key]
351
+ info = self.load_state_dict(state_dict, False)
352
+ return info
353
+
354
+ if os.path.splitext(file)[1] == ".safetensors":
355
+ from safetensors.torch import load_file
356
+
357
+ weights_sd = load_file(file)
358
+ else:
359
+ weights_sd = torch.load(file, map_location="cpu")
360
+
361
+ # module_name = module_name.replace("_block", "@blocks")
362
+ # module_name = module_name.replace("_layer", "@layer")
363
+ # module_name = module_name.replace("to_", "to@")
364
+ # module_name = module_name.replace("time_embed", "time@embed")
365
+ # module_name = module_name.replace("label_emb", "label@emb")
366
+ # module_name = module_name.replace("skip_connection", "skip@connection")
367
+ # module_name = module_name.replace("proj_in", "proj@in")
368
+ # module_name = module_name.replace("proj_out", "proj@out")
369
+ pattern = re.compile(r"(_block|_layer|to_|time_embed|label_emb|skip_connection|proj_in|proj_out)")
370
+
371
+ # convert to lllite with U-Net state dict
372
+ state_dict = non_lllite_unet_sd.copy() if non_lllite_unet_sd is not None else {}
373
+ for key in weights_sd.keys():
374
+ # split with "."
375
+ pos = key.find(".")
376
+ if pos < 0:
377
+ continue
378
+
379
+ module_name = key[:pos]
380
+ weight_name = key[pos + 1 :] # exclude "."
381
+ module_name = module_name.replace(SdxlUNet2DConditionModelControlNetLLLite.LLLITE_PREFIX + "_", "")
382
+
383
+ # これはうまくいかない。逆変換を考えなかった設計が悪い / this does not work well. bad design because I didn't think about inverse conversion
384
+ # module_name = module_name.replace("_", ".")
385
+
386
+ # ださいけどSDXLのU-Netの "_" を "@" に変換する / ugly but convert "_" of SDXL U-Net to "@"
387
+ matches = pattern.findall(module_name)
388
+ if matches is not None:
389
+ for m in matches:
390
+ logger.info(f"{module_name} {m}")
391
+ module_name = module_name.replace(m, m.replace("_", "@"))
392
+ module_name = module_name.replace("_", ".")
393
+ module_name = module_name.replace("@", "_")
394
+
395
+ lllite_key = module_name + ".lllite_" + weight_name
396
+
397
+ state_dict[lllite_key] = weights_sd[key]
398
+
399
+ info = self.load_state_dict(state_dict, False)
400
+ return info
401
+
402
+ def forward(self, x, timesteps=None, context=None, y=None, cond_image=None, **kwargs):
403
+ for m in self.lllite_modules:
404
+ m.set_cond_image(cond_image)
405
+ return super().forward(x, timesteps, context, y, **kwargs)
406
+
407
+
408
+ def replace_unet_linear_and_conv2d():
409
+ logger.info("replace torch.nn.Linear and torch.nn.Conv2d with LLLiteLinear and LLLiteConv2d in U-Net")
410
+ sdxl_original_unet.torch.nn.Linear = LLLiteLinear
411
+ sdxl_original_unet.torch.nn.Conv2d = LLLiteConv2d
412
+
413
+
414
+ if __name__ == "__main__":
415
+ # デバッグ用 / for debug
416
+
417
+ # sdxl_original_unet.USE_REENTRANT = False
418
+ replace_unet_linear_and_conv2d()
419
+
420
+ # test shape etc
421
+ logger.info("create unet")
422
+ unet = SdxlUNet2DConditionModelControlNetLLLite()
423
+
424
+ logger.info("enable ControlNet-LLLite")
425
+ unet.apply_lllite(32, 64, None, False, 1.0)
426
+ unet.to("cuda") # .to(torch.float16)
427
+
428
+ # from safetensors.torch import load_file
429
+
430
+ # model_sd = load_file(r"E:\Work\SD\Models\sdxl\sd_xl_base_1.0_0.9vae.safetensors")
431
+ # unet_sd = {}
432
+
433
+ # # copy U-Net keys from unet_state_dict to state_dict
434
+ # prefix = "model.diffusion_model."
435
+ # for key in model_sd.keys():
436
+ # if key.startswith(prefix):
437
+ # converted_key = key[len(prefix) :]
438
+ # unet_sd[converted_key] = model_sd[key]
439
+
440
+ # info = unet.load_lllite_weights("r:/lllite_from_unet.safetensors", unet_sd)
441
+ # logger.info(info)
442
+
443
+ # logger.info(unet)
444
+
445
+ # logger.info number of parameters
446
+ params = unet.prepare_params()
447
+ logger.info(f"number of parameters {sum(p.numel() for p in params)}")
448
+ # logger.info("type any key to continue")
449
+ # input()
450
+
451
+ unet.set_use_memory_efficient_attention(True, False)
452
+ unet.set_gradient_checkpointing(True)
453
+ unet.train() # for gradient checkpointing
454
+
455
+ # # visualize
456
+ # import torchviz
457
+ # logger.info("run visualize")
458
+ # controlnet.set_control(conditioning_image)
459
+ # output = unet(x, t, ctx, y)
460
+ # logger.info("make_dot")
461
+ # image = torchviz.make_dot(output, params=dict(controlnet.named_parameters()))
462
+ # logger.info("render")
463
+ # image.format = "svg" # "png"
464
+ # image.render("NeuralNet") # すごく時間がかかるので注意 / be careful because it takes a long time
465
+ # input()
466
+
467
+ import bitsandbytes
468
+
469
+ optimizer = bitsandbytes.adam.Adam8bit(params, 1e-3)
470
+
471
+ scaler = torch.cuda.amp.GradScaler(enabled=True)
472
+
473
+ logger.info("start training")
474
+ steps = 10
475
+ batch_size = 1
476
+
477
+ sample_param = [p for p in unet.named_parameters() if ".lllite_up." in p[0]][0]
478
+ for step in range(steps):
479
+ logger.info(f"step {step}")
480
+
481
+ conditioning_image = torch.rand(batch_size, 3, 1024, 1024).cuda() * 2.0 - 1.0
482
+ x = torch.randn(batch_size, 4, 128, 128).cuda()
483
+ t = torch.randint(low=0, high=10, size=(batch_size,)).cuda()
484
+ ctx = torch.randn(batch_size, 77, 2048).cuda()
485
+ y = torch.randn(batch_size, sdxl_original_unet.ADM_IN_CHANNELS).cuda()
486
+
487
+ with torch.cuda.amp.autocast(enabled=True, dtype=torch.bfloat16):
488
+ output = unet(x, t, ctx, y, conditioning_image)
489
+ target = torch.randn_like(output)
490
+ loss = torch.nn.functional.mse_loss(output, target)
491
+
492
+ scaler.scale(loss).backward()
493
+ scaler.step(optimizer)
494
+ scaler.update()
495
+ optimizer.zero_grad(set_to_none=True)
496
+ logger.info(sample_param)
497
+
498
+ # from safetensors.torch import save_file
499
+
500
+ # logger.info("save weights")
501
+ # unet.save_lllite_weights("r:/lllite_from_unet.safetensors", torch.float16, None)
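As the debug block shows, the Linear/Conv2d replacement has to happen before the U-Net is instantiated so that the LLLite subclasses are baked into the model. A minimal training-side sketch (unet_sd is a hypothetical dict of base SDXL U-Net weights, keyed as in the commented-out example above):

replace_unet_linear_and_conv2d()                 # must run before SdxlUNet2DConditionModelControlNetLLLite()
unet = SdxlUNet2DConditionModelControlNetLLLite()
unet.apply_lllite(32, 64, None, False, 1.0)      # cond_emb_dim, mlp_dim, dropout, varbose, multiplier
unet.load_lllite_weights(None, unet_sd)          # file=None: keep initialized LLLite weights, copy base U-Net weights
params = unet.prepare_params()                   # only parameters containing "lllite" stay trainable

output = unet(x, t, ctx, y, cond_image)          # cond_image is forwarded to every enabled LLLite module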
convert_diffusers20_original_sd.py ADDED
@@ -0,0 +1,163 @@
1
+ # convert Diffusers v1.x/v2.0 model to original Stable Diffusion
2
+
3
+ import argparse
4
+ import os
5
+ import torch
6
+ from diffusers import StableDiffusionPipeline
7
+
8
+ import library.model_util as model_util
9
+ from library.utils import setup_logging
10
+ setup_logging()
11
+ import logging
12
+ logger = logging.getLogger(__name__)
13
+
14
+ def convert(args):
15
+ # 引数を確認する / check arguments
16
+ load_dtype = torch.float16 if args.fp16 else None
17
+
18
+ save_dtype = None
19
+ if args.fp16 or args.save_precision_as == "fp16":
20
+ save_dtype = torch.float16
21
+ elif args.bf16 or args.save_precision_as == "bf16":
22
+ save_dtype = torch.bfloat16
23
+ elif args.float or args.save_precision_as == "float":
24
+ save_dtype = torch.float
25
+
26
+ is_load_ckpt = os.path.isfile(args.model_to_load)
27
+ is_save_ckpt = len(os.path.splitext(args.model_to_save)[1]) > 0
28
+
29
+ assert not is_load_ckpt or args.v1 != args.v2, "v1 or v2 is required to load checkpoint / checkpointの読み込みにはv1/v2指定が必要です"
30
+ # assert (
31
+ # is_save_ckpt or args.reference_model is not None
32
+ # ), f"reference model is required to save as Diffusers / Diffusers形式での保存には参照モデルが必要です"
33
+
34
+ # モデルを読み込む / load the model
35
+ msg = "checkpoint" if is_load_ckpt else ("Diffusers" + (" as fp16" if args.fp16 else ""))
36
+ logger.info(f"loading {msg}: {args.model_to_load}")
37
+
38
+ if is_load_ckpt:
39
+ v2_model = args.v2
40
+ text_encoder, vae, unet = model_util.load_models_from_stable_diffusion_checkpoint(
41
+ v2_model, args.model_to_load, unet_use_linear_projection_in_v2=args.unet_use_linear_projection
42
+ )
43
+ else:
44
+ pipe = StableDiffusionPipeline.from_pretrained(
45
+ args.model_to_load, torch_dtype=load_dtype, tokenizer=None, safety_checker=None, variant=args.variant
46
+ )
47
+ text_encoder = pipe.text_encoder
48
+ vae = pipe.vae
49
+ unet = pipe.unet
50
+
51
+ if args.v1 == args.v2:
52
+ # 自動判定する / auto-detect v1/v2
53
+ v2_model = unet.config.cross_attention_dim == 1024
54
+ logger.info("checking model version: model is " + ("v2" if v2_model else "v1"))
55
+ else:
56
+ v2_model = not args.v1
57
+
58
+ # 変換して保存する / convert and save
59
+ msg = ("checkpoint" + ("" if save_dtype is None else f" in {save_dtype}")) if is_save_ckpt else "Diffusers"
60
+ logger.info(f"converting and saving as {msg}: {args.model_to_save}")
61
+
62
+ if is_save_ckpt:
63
+ original_model = args.model_to_load if is_load_ckpt else None
64
+ key_count = model_util.save_stable_diffusion_checkpoint(
65
+ v2_model,
66
+ args.model_to_save,
67
+ text_encoder,
68
+ unet,
69
+ original_model,
70
+ args.epoch,
71
+ args.global_step,
72
+ None if args.metadata is None else eval(args.metadata),
73
+ save_dtype=save_dtype,
74
+ vae=vae,
75
+ )
76
+ logger.info(f"model saved. total converted state_dict keys: {key_count}")
77
+ else:
78
+ logger.info(
79
+ f"copy scheduler/tokenizer config from: {args.reference_model if args.reference_model is not None else 'default model'}"
80
+ )
81
+ model_util.save_diffusers_checkpoint(
82
+ v2_model, args.model_to_save, text_encoder, unet, args.reference_model, vae, args.use_safetensors
83
+ )
84
+ logger.info("model saved.")
85
+
86
+
87
+ def setup_parser() -> argparse.ArgumentParser:
88
+ parser = argparse.ArgumentParser()
89
+ parser.add_argument(
90
+ "--v1", action="store_true", help="load v1.x model (v1 or v2 is required to load checkpoint) / 1.xのモデルを読み込む"
91
+ )
92
+ parser.add_argument(
93
+ "--v2", action="store_true", help="load v2.0 model (v1 or v2 is required to load checkpoint) / 2.0のモデルを読み込む"
94
+ )
95
+ parser.add_argument(
96
+ "--unet_use_linear_projection",
97
+ action="store_true",
98
+ help="When saving v2 model as Diffusers, set U-Net config to `use_linear_projection=true` (to match stabilityai's model) / Diffusers形式でv2モデルを保存するときにU-Netの設定を`use_linear_projection=true`にする(stabilityaiのモデルと合わせる)",
99
+ )
100
+ parser.add_argument(
101
+ "--fp16",
102
+ action="store_true",
103
+ help="load as fp16 (Diffusers only) and save as fp16 (checkpoint only) / fp16形式で読み込み(Diffusers形式のみ対応)、保存する(checkpointのみ対応)",
104
+ )
105
+ parser.add_argument("--bf16", action="store_true", help="save as bf16 (checkpoint only) / bf16形式で保存する(checkpointのみ対応)")
106
+ parser.add_argument(
107
+ "--float", action="store_true", help="save as float (checkpoint only) / float(float32)形式で保存する(checkpointのみ対応)"
108
+ )
109
+ parser.add_argument(
110
+ "--save_precision_as",
111
+ type=str,
112
+ default="no",
113
+ choices=["fp16", "bf16", "float"],
114
+ help="save precision, do not specify with --fp16/--bf16/--float / 保存する精度、--fp16/--bf16/--floatと併用しないでください",
115
+ )
116
+ parser.add_argument("--epoch", type=int, default=0, help="epoch to write to checkpoint / checkpointに記録するepoch数の値")
117
+ parser.add_argument(
118
+ "--global_step", type=int, default=0, help="global_step to write to checkpoint / checkpointに記録するglobal_stepの値"
119
+ )
120
+ parser.add_argument(
121
+ "--metadata",
122
+ type=str,
123
+ default=None,
124
+ help='モデルに保存されるメタデータ、Pythonの辞書形式で指定 / metadata: metadata written in to the model in Python Dictionary. Example metadata: \'{"name": "model_name", "resolution": "512x512"}\'',
125
+ )
126
+ parser.add_argument(
127
+ "--variant",
128
+ type=str,
129
+ default=None,
130
+ help="読み込むDiffusersのvariantを指定する、例: fp16 / variant: Diffusers variant to load. Example: fp16",
131
+ )
132
+ parser.add_argument(
133
+ "--reference_model",
134
+ type=str,
135
+ default=None,
136
+ help="scheduler/tokenizerのコピー元Diffusersモデル、Diffusers形式で保存するときに使用される、省略時は`runwayml/stable-diffusion-v1-5` または `stabilityai/stable-diffusion-2-1` / reference Diffusers model to copy scheduler/tokenizer config from, used when saving as Diffusers format, default is `runwayml/stable-diffusion-v1-5` or `stabilityai/stable-diffusion-2-1`",
137
+ )
138
+ parser.add_argument(
139
+ "--use_safetensors",
140
+ action="store_true",
141
+ help="use safetensors format to save Diffusers model (checkpoint depends on the file extension) / Duffusersモデルをsafetensors形式で保存する(checkpointは拡張子で自動判定)",
142
+ )
143
+
144
+ parser.add_argument(
145
+ "model_to_load",
146
+ type=str,
147
+ default=None,
148
+ help="model to load: checkpoint file or Diffusers model's directory / 読み込むモデル、checkpointかDiffusers形式モデルのディレクトリ",
149
+ )
150
+ parser.add_argument(
151
+ "model_to_save",
152
+ type=str,
153
+ default=None,
154
+ help="model to save: checkpoint (with extension) or Diffusers model's directory (without extension) / 変換後のモデル、拡張子がある場合はcheckpoint、ない場合はDiffusersモデルとして保存",
155
+ )
156
+ return parser
157
+
158
+
159
+ if __name__ == "__main__":
160
+ parser = setup_parser()
161
+
162
+ args = parser.parse_args()
163
+ convert(args)
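For example, converting a Diffusers v1.x model directory to a single fp16 checkpoint (paths are placeholders; the v1/v2 flag can be omitted here because it is auto-detected from cross_attention_dim when loading Diffusers) would be run as `python convert_diffusers20_original_sd.py --fp16 path/to/diffusers_model model.safetensors`, while the reverse direction requires `--v1` or `--v2` and an output directory without an extension.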
custom_train_functions.py ADDED
@@ -0,0 +1,559 @@
1
+ import torch
2
+ import argparse
3
+ import random
4
+ import re
5
+ from typing import List, Optional, Union
6
+ from .utils import setup_logging
7
+
8
+ setup_logging()
9
+ import logging
10
+
11
+ logger = logging.getLogger(__name__)
12
+
13
+
14
+ def prepare_scheduler_for_custom_training(noise_scheduler, device):
15
+ if hasattr(noise_scheduler, "all_snr"):
16
+ return
17
+
18
+ alphas_cumprod = noise_scheduler.alphas_cumprod
19
+ sqrt_alphas_cumprod = torch.sqrt(alphas_cumprod)
20
+ sqrt_one_minus_alphas_cumprod = torch.sqrt(1.0 - alphas_cumprod)
21
+ alpha = sqrt_alphas_cumprod
22
+ sigma = sqrt_one_minus_alphas_cumprod
23
+ all_snr = (alpha / sigma) ** 2
24
+
25
+ noise_scheduler.all_snr = all_snr.to(device)
26
+
27
+
28
+ def fix_noise_scheduler_betas_for_zero_terminal_snr(noise_scheduler):
29
+ # fix beta: zero terminal SNR
30
+ logger.info(f"fix noise scheduler betas: https://arxiv.org/abs/2305.08891")
31
+
32
+ def enforce_zero_terminal_snr(betas):
33
+ # Convert betas to alphas_bar_sqrt
34
+ alphas = 1 - betas
35
+ alphas_bar = alphas.cumprod(0)
36
+ alphas_bar_sqrt = alphas_bar.sqrt()
37
+
38
+ # Store old values.
39
+ alphas_bar_sqrt_0 = alphas_bar_sqrt[0].clone()
40
+ alphas_bar_sqrt_T = alphas_bar_sqrt[-1].clone()
41
+ # Shift so last timestep is zero.
42
+ alphas_bar_sqrt -= alphas_bar_sqrt_T
43
+ # Scale so first timestep is back to old value.
44
+ alphas_bar_sqrt *= alphas_bar_sqrt_0 / (alphas_bar_sqrt_0 - alphas_bar_sqrt_T)
45
+
46
+ # Convert alphas_bar_sqrt to betas
47
+ alphas_bar = alphas_bar_sqrt**2
48
+ alphas = alphas_bar[1:] / alphas_bar[:-1]
49
+ alphas = torch.cat([alphas_bar[0:1], alphas])
50
+ betas = 1 - alphas
51
+ return betas
52
+
53
+ betas = noise_scheduler.betas
54
+ betas = enforce_zero_terminal_snr(betas)
55
+ alphas = 1.0 - betas
56
+ alphas_cumprod = torch.cumprod(alphas, dim=0)
57
+
58
+ # logger.info(f"original: {noise_scheduler.betas}")
59
+ # logger.info(f"fixed: {betas}")
60
+
61
+ noise_scheduler.betas = betas
62
+ noise_scheduler.alphas = alphas
63
+ noise_scheduler.alphas_cumprod = alphas_cumprod
64
+
65
+
66
+ def apply_snr_weight(loss, timesteps, noise_scheduler, gamma, v_prediction=False):
67
+ snr = torch.stack([noise_scheduler.all_snr[t] for t in timesteps])
68
+ min_snr_gamma = torch.minimum(snr, torch.full_like(snr, gamma))
69
+ if v_prediction:
70
+ snr_weight = torch.div(min_snr_gamma, snr + 1).float().to(loss.device)
71
+ else:
72
+ snr_weight = torch.div(min_snr_gamma, snr).float().to(loss.device)
73
+ loss = loss * snr_weight
74
+ return loss
75
+
76
+
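+ # Worked example of the min-SNR weighting above (illustrative numbers, not taken from this file):
+ #   all_snr[t] = (sqrt_alphas_cumprod[t] / sqrt_one_minus_alphas_cumprod[t]) ** 2
+ #              = alphas_cumprod[t] / (1 - alphas_cumprod[t])
+ #   epsilon-prediction: weight = min(SNR, gamma) / SNR
+ #     gamma = 5, SNR = 20 (low-noise timestep)  -> weight = 5 / 20 = 0.25
+ #     gamma = 5, SNR = 2  (high-noise timestep) -> weight = 2 / 2  = 1.0
+ #   v-prediction:       weight = min(SNR, gamma) / (SNR + 1)
+ #     gamma = 5, SNR = 20 -> weight = 5 / 21 ≈ 0.238
+ #   i.e. low-noise (high-SNR) timesteps are down-weighted, which is what --min_snr_gamma is for.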
77
+ def scale_v_prediction_loss_like_noise_prediction(loss, timesteps, noise_scheduler):
78
+ scale = get_snr_scale(timesteps, noise_scheduler)
79
+ loss = loss * scale
80
+ return loss
81
+
82
+
83
+ def get_snr_scale(timesteps, noise_scheduler):
84
+ snr_t = torch.stack([noise_scheduler.all_snr[t] for t in timesteps]) # batch_size
85
+ snr_t = torch.minimum(snr_t, torch.ones_like(snr_t) * 1000) # if timestep is 0, snr_t is inf, so limit it to 1000
86
+ scale = snr_t / (snr_t + 1)
87
+ # # show debug info
88
+ # logger.info(f"timesteps: {timesteps}, snr_t: {snr_t}, scale: {scale}")
89
+ return scale
90
+
91
+
92
+ def add_v_prediction_like_loss(loss, timesteps, noise_scheduler, v_pred_like_loss):
93
+ scale = get_snr_scale(timesteps, noise_scheduler)
94
+ # logger.info(f"add v-prediction like loss: {v_pred_like_loss}, scale: {scale}, loss: {loss}, time: {timesteps}")
95
+ loss = loss + loss / scale * v_pred_like_loss
96
+ return loss
97
+
98
+
99
+ def apply_debiased_estimation(loss, timesteps, noise_scheduler, v_prediction=False):
100
+ snr_t = torch.stack([noise_scheduler.all_snr[t] for t in timesteps]) # batch_size
101
+ snr_t = torch.minimum(snr_t, torch.ones_like(snr_t) * 1000) # if timestep is 0, snr_t is inf, so limit it to 1000
102
+ if v_prediction:
103
+ weight = 1 / (snr_t + 1)
104
+ else:
105
+ weight = 1 / torch.sqrt(snr_t)
106
+ loss = weight * loss
107
+ return loss
108
+
109
+
110
+ # TODO train_utilと分散しているのでどちらかに寄せる / TODO: these are split between here and train_util, consolidate into one place
111
+
112
+
113
+ def add_custom_train_arguments(parser: argparse.ArgumentParser, support_weighted_captions: bool = True):
114
+ parser.add_argument(
115
+ "--min_snr_gamma",
116
+ type=float,
117
+ default=None,
118
+ help="gamma for reducing the weight of high loss timesteps. Lower numbers have stronger effect. 5 is recommended by paper. / 低いタイムステップでの高いlossに対して重みを減らすためのgamma値、低いほど効果が強く、論文では5が推奨",
119
+ )
120
+ parser.add_argument(
121
+ "--scale_v_pred_loss_like_noise_pred",
122
+ action="store_true",
123
+ help="scale v-prediction loss like noise prediction loss / v-prediction lossをnoise prediction lossと同じようにスケーリングする",
124
+ )
125
+ parser.add_argument(
126
+ "--v_pred_like_loss",
127
+ type=float,
128
+ default=None,
129
+ help="add v-prediction like loss multiplied by this value / v-prediction lossをこの値をかけたものをlossに加算する",
130
+ )
131
+ parser.add_argument(
132
+ "--debiased_estimation_loss",
133
+ action="store_true",
134
+ help="debiased estimation loss / debiased estimation loss",
135
+ )
136
+ if support_weighted_captions:
137
+ parser.add_argument(
138
+ "--weighted_captions",
139
+ action="store_true",
140
+ default=False,
141
+ help="Enable weighted captions in the standard style (token:1.3). No commas inside parens, or shuffle/dropout may break the decoder. / 「[token]」、「(token)」「(token:1.3)」のような重み付きキャプションを有効にする。カンマを括弧内に入れるとシャッフルやdropoutで重みづけがおかしくなるので注意",
142
+ )
143
+
144
+
145
+ re_attention = re.compile(
146
+ r"""
147
+ \\\(|
148
+ \\\)|
149
+ \\\[|
150
+ \\]|
151
+ \\\\|
152
+ \\|
153
+ \(|
154
+ \[|
155
+ :([+-]?[.\d]+)\)|
156
+ \)|
157
+ ]|
158
+ [^\\()\[\]:]+|
159
+ :
160
+ """,
161
+ re.X,
162
+ )
163
+
164
+
165
+ def parse_prompt_attention(text):
166
+ """
167
+ Parses a string with attention tokens and returns a list of pairs: text and its associated weight.
168
+ Accepted tokens are:
169
+ (abc) - increases attention to abc by a multiplier of 1.1
170
+ (abc:3.12) - increases attention to abc by a multiplier of 3.12
171
+ [abc] - decreases attention to abc by a multiplier of 1.1
172
+ \( - literal character '('
173
+ \[ - literal character '['
174
+ \) - literal character ')'
175
+ \] - literal character ']'
176
+ \\ - literal character '\'
177
+ anything else - just text
178
+ >>> parse_prompt_attention('normal text')
179
+ [['normal text', 1.0]]
180
+ >>> parse_prompt_attention('an (important) word')
181
+ [['an ', 1.0], ['important', 1.1], [' word', 1.0]]
182
+ >>> parse_prompt_attention('(unbalanced')
183
+ [['unbalanced', 1.1]]
184
+ >>> parse_prompt_attention('\(literal\]')
185
+ [['(literal]', 1.0]]
186
+ >>> parse_prompt_attention('(unnecessary)(parens)')
187
+ [['unnecessaryparens', 1.1]]
188
+ >>> parse_prompt_attention('a (((house:1.3)) [on] a (hill:0.5), sun, (((sky))).')
189
+ [['a ', 1.0],
190
+ ['house', 1.5730000000000004],
191
+ [' ', 1.1],
192
+ ['on', 1.0],
193
+ [' a ', 1.1],
194
+ ['hill', 0.55],
195
+ [', sun, ', 1.1],
196
+ ['sky', 1.4641000000000006],
197
+ ['.', 1.1]]
198
+ """
199
+
200
+ res = []
201
+ round_brackets = []
202
+ square_brackets = []
203
+
204
+ round_bracket_multiplier = 1.1
205
+ square_bracket_multiplier = 1 / 1.1
206
+
207
+ def multiply_range(start_position, multiplier):
208
+ for p in range(start_position, len(res)):
209
+ res[p][1] *= multiplier
210
+
211
+ for m in re_attention.finditer(text):
212
+ text = m.group(0)
213
+ weight = m.group(1)
214
+
215
+ if text.startswith("\\"):
216
+ res.append([text[1:], 1.0])
217
+ elif text == "(":
218
+ round_brackets.append(len(res))
219
+ elif text == "[":
220
+ square_brackets.append(len(res))
221
+ elif weight is not None and len(round_brackets) > 0:
222
+ multiply_range(round_brackets.pop(), float(weight))
223
+ elif text == ")" and len(round_brackets) > 0:
224
+ multiply_range(round_brackets.pop(), round_bracket_multiplier)
225
+ elif text == "]" and len(square_brackets) > 0:
226
+ multiply_range(square_brackets.pop(), square_bracket_multiplier)
227
+ else:
228
+ res.append([text, 1.0])
229
+
230
+ for pos in round_brackets:
231
+ multiply_range(pos, round_bracket_multiplier)
232
+
233
+ for pos in square_brackets:
234
+ multiply_range(pos, square_bracket_multiplier)
235
+
236
+ if len(res) == 0:
237
+ res = [["", 1.0]]
238
+
239
+ # merge runs of identical weights
240
+ i = 0
241
+ while i + 1 < len(res):
242
+ if res[i][1] == res[i + 1][1]:
243
+ res[i][0] += res[i + 1][0]
244
+ res.pop(i + 1)
245
+ else:
246
+ i += 1
247
+
248
+ return res
249
+
250
+
251
+ def get_prompts_with_weights(tokenizer, prompt: List[str], max_length: int):
252
+ r"""
253
+ Tokenize a list of prompts and return its tokens with weights of each token.
254
+
255
+ No padding, starting or ending token is included.
256
+ """
257
+ tokens = []
258
+ weights = []
259
+ truncated = False
260
+ for text in prompt:
261
+ texts_and_weights = parse_prompt_attention(text)
262
+ text_token = []
263
+ text_weight = []
264
+ for word, weight in texts_and_weights:
265
+ # tokenize and discard the starting and the ending token
266
+ token = tokenizer(word).input_ids[1:-1]
267
+ text_token += token
268
+ # copy the weight by length of token
269
+ text_weight += [weight] * len(token)
270
+ # stop if the text is too long (longer than truncation limit)
271
+ if len(text_token) > max_length:
272
+ truncated = True
273
+ break
274
+ # truncate
275
+ if len(text_token) > max_length:
276
+ truncated = True
277
+ text_token = text_token[:max_length]
278
+ text_weight = text_weight[:max_length]
279
+ tokens.append(text_token)
280
+ weights.append(text_weight)
281
+ if truncated:
282
+ logger.warning("Prompt was truncated. Try to shorten the prompt or increase max_embeddings_multiples")
283
+ return tokens, weights
284
+
285
+
286
+ def pad_tokens_and_weights(tokens, weights, max_length, bos, eos, no_boseos_middle=True, chunk_length=77):
287
+ r"""
288
+ Pad the tokens (with starting and ending tokens) and weights (with 1.0) to max_length.
289
+ """
290
+ max_embeddings_multiples = (max_length - 2) // (chunk_length - 2)
291
+ weights_length = max_length if no_boseos_middle else max_embeddings_multiples * chunk_length
292
+ for i in range(len(tokens)):
293
+ tokens[i] = [bos] + tokens[i] + [eos] * (max_length - 1 - len(tokens[i]))
294
+ if no_boseos_middle:
295
+ weights[i] = [1.0] + weights[i] + [1.0] * (max_length - 1 - len(weights[i]))
296
+ else:
297
+ w = []
298
+ if len(weights[i]) == 0:
299
+ w = [1.0] * weights_length
300
+ else:
301
+ for j in range(max_embeddings_multiples):
302
+ w.append(1.0) # weight for starting token in this chunk
303
+ w += weights[i][j * (chunk_length - 2) : min(len(weights[i]), (j + 1) * (chunk_length - 2))]
304
+ w.append(1.0) # weight for ending token in this chunk
305
+ w += [1.0] * (weights_length - len(w))
306
+ weights[i] = w[:]
307
+
308
+ return tokens, weights
309
+
310
+
311
+ def get_unweighted_text_embeddings(
312
+ tokenizer,
313
+ text_encoder,
314
+ text_input: torch.Tensor,
315
+ chunk_length: int,
316
+ clip_skip: int,
317
+ eos: int,
318
+ pad: int,
319
+ no_boseos_middle: Optional[bool] = True,
320
+ ):
321
+ """
322
+ When the length of tokens is a multiple of the capacity of the text encoder,
323
+ it should be split into chunks and sent to the text encoder individually.
324
+ """
325
+ max_embeddings_multiples = (text_input.shape[1] - 2) // (chunk_length - 2)
326
+ if max_embeddings_multiples > 1:
327
+ text_embeddings = []
328
+ for i in range(max_embeddings_multiples):
329
+ # extract the i-th chunk
330
+ text_input_chunk = text_input[:, i * (chunk_length - 2) : (i + 1) * (chunk_length - 2) + 2].clone()
331
+
332
+ # cover the head and the tail by the starting and the ending tokens
333
+ text_input_chunk[:, 0] = text_input[0, 0]
334
+ if pad == eos: # v1
335
+ text_input_chunk[:, -1] = text_input[0, -1]
336
+ else: # v2
337
+ for j in range(len(text_input_chunk)):
338
+ if text_input_chunk[j, -1] != eos and text_input_chunk[j, -1] != pad: # the last token is a normal token, not EOS/PAD
339
+ text_input_chunk[j, -1] = eos
340
+ if text_input_chunk[j, 1] == pad: # only BOS, the rest is PAD
341
+ text_input_chunk[j, 1] = eos
342
+
343
+ if clip_skip is None or clip_skip == 1:
344
+ text_embedding = text_encoder(text_input_chunk)[0]
345
+ else:
346
+ enc_out = text_encoder(text_input_chunk, output_hidden_states=True, return_dict=True)
347
+ text_embedding = enc_out["hidden_states"][-clip_skip]
348
+ text_embedding = text_encoder.text_model.final_layer_norm(text_embedding)
349
+
350
+ if no_boseos_middle:
351
+ if i == 0:
352
+ # discard the ending token
353
+ text_embedding = text_embedding[:, :-1]
354
+ elif i == max_embeddings_multiples - 1:
355
+ # discard the starting token
356
+ text_embedding = text_embedding[:, 1:]
357
+ else:
358
+ # discard both starting and ending tokens
359
+ text_embedding = text_embedding[:, 1:-1]
360
+
361
+ text_embeddings.append(text_embedding)
362
+ text_embeddings = torch.concat(text_embeddings, axis=1)
363
+ else:
364
+ if clip_skip is None or clip_skip == 1:
365
+ text_embeddings = text_encoder(text_input)[0]
366
+ else:
367
+ enc_out = text_encoder(text_input, output_hidden_states=True, return_dict=True)
368
+ text_embeddings = enc_out["hidden_states"][-clip_skip]
369
+ text_embeddings = text_encoder.text_model.final_layer_norm(text_embeddings)
370
+ return text_embeddings
371
+
372
+
373
+ def get_weighted_text_embeddings(
374
+ tokenizer,
375
+ text_encoder,
376
+ prompt: Union[str, List[str]],
377
+ device,
378
+ max_embeddings_multiples: Optional[int] = 3,
379
+ no_boseos_middle: Optional[bool] = False,
380
+ clip_skip=None,
381
+ ):
382
+ r"""
383
+ Prompts can be assigned with local weights using brackets. For example,
384
+ prompt 'A (very beautiful) masterpiece' highlights the words 'very beautiful',
385
+ and the embedding tokens corresponding to the words get multiplied by a constant, 1.1.
386
+
387
+ Also, to regularize of the embedding, the weighted embedding would be scaled to preserve the original mean.
388
+
389
+ Args:
390
+ prompt (`str` or `List[str]`):
391
+ The prompt or prompts to guide the image generation.
392
+ max_embeddings_multiples (`int`, *optional*, defaults to `3`):
393
+ The max multiple length of prompt embeddings compared to the max output length of text encoder.
394
+ no_boseos_middle (`bool`, *optional*, defaults to `False`):
395
+ If the length of text token is multiples of the capacity of text encoder, whether reserve the starting and
396
+ ending token in each of the chunk in the middle.
397
+ skip_parsing (`bool`, *optional*, defaults to `False`):
398
+ Skip the parsing of brackets.
399
+ skip_weighting (`bool`, *optional*, defaults to `False`):
400
+ Skip the weighting. When the parsing is skipped, it is forced True.
401
+ """
402
+ max_length = (tokenizer.model_max_length - 2) * max_embeddings_multiples + 2
403
+ if isinstance(prompt, str):
404
+ prompt = [prompt]
405
+
406
+ prompt_tokens, prompt_weights = get_prompts_with_weights(tokenizer, prompt, max_length - 2)
407
+
408
+ # round up the longest length of tokens to a multiple of (model_max_length - 2)
409
+ max_length = max([len(token) for token in prompt_tokens])
410
+
411
+ max_embeddings_multiples = min(
412
+ max_embeddings_multiples,
413
+ (max_length - 1) // (tokenizer.model_max_length - 2) + 1,
414
+ )
415
+ max_embeddings_multiples = max(1, max_embeddings_multiples)
416
+ max_length = (tokenizer.model_max_length - 2) * max_embeddings_multiples + 2
417
+
418
+ # pad the length of tokens and weights
419
+ bos = tokenizer.bos_token_id
420
+ eos = tokenizer.eos_token_id
421
+ pad = tokenizer.pad_token_id
422
+ prompt_tokens, prompt_weights = pad_tokens_and_weights(
423
+ prompt_tokens,
424
+ prompt_weights,
425
+ max_length,
426
+ bos,
427
+ eos,
428
+ no_boseos_middle=no_boseos_middle,
429
+ chunk_length=tokenizer.model_max_length,
430
+ )
431
+ prompt_tokens = torch.tensor(prompt_tokens, dtype=torch.long, device=device)
432
+
433
+ # get the embeddings
434
+ text_embeddings = get_unweighted_text_embeddings(
435
+ tokenizer,
436
+ text_encoder,
437
+ prompt_tokens,
438
+ tokenizer.model_max_length,
439
+ clip_skip,
440
+ eos,
441
+ pad,
442
+ no_boseos_middle=no_boseos_middle,
443
+ )
444
+ prompt_weights = torch.tensor(prompt_weights, dtype=text_embeddings.dtype, device=device)
445
+
446
+ # assign weights to the prompts and normalize in the sense of mean
447
+ previous_mean = text_embeddings.float().mean(axis=[-2, -1]).to(text_embeddings.dtype)
448
+ text_embeddings = text_embeddings * prompt_weights.unsqueeze(-1)
449
+ current_mean = text_embeddings.float().mean(axis=[-2, -1]).to(text_embeddings.dtype)
450
+ text_embeddings = text_embeddings * (previous_mean / current_mean).unsqueeze(-1).unsqueeze(-1)
451
+
452
+ return text_embeddings
453
+
454
+
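A minimal usage sketch for the weighted-embedding path above, assuming the functions defined in this file are in scope and using the SD v1 CLIP text encoder "openai/clip-vit-large-patch14" as an illustrative choice (any compatible tokenizer/encoder pair should work):

import torch
from transformers import CLIPTokenizer, CLIPTextModel

tokenizer = CLIPTokenizer.from_pretrained("openai/clip-vit-large-patch14")
text_encoder = CLIPTextModel.from_pretrained("openai/clip-vit-large-patch14")

with torch.no_grad():
    embeddings = get_weighted_text_embeddings(
        tokenizer,
        text_encoder,
        ["a (very beautiful:1.3) landscape, [blurry]"],  # weighted syntax handled by parse_prompt_attention
        device=torch.device("cpu"),
        max_embeddings_multiples=3,
        clip_skip=None,
    )
print(embeddings.shape)  # (1, 77, 768) for a short single prompt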
455
+ # https://wandb.ai/johnowhitaker/multires_noise/reports/Multi-Resolution-Noise-for-Diffusion-Model-Training--VmlldzozNjYyOTU2
456
+ def pyramid_noise_like(noise, device, iterations=6, discount=0.4):
457
+ b, c, w, h = noise.shape # EDIT: w and h get over-written, rename for a different variant!
458
+ u = torch.nn.Upsample(size=(w, h), mode="bilinear").to(device)
459
+ for i in range(iterations):
460
+ r = random.random() * 2 + 2 # Rather than always going 2x,
461
+ wn, hn = max(1, int(w / (r**i))), max(1, int(h / (r**i)))
462
+ noise += u(torch.randn(b, c, wn, hn).to(device)) * discount**i
463
+ if wn == 1 or hn == 1:
464
+ break # Lowest resolution is 1x1
465
+ return noise / noise.std() # Scaled back to roughly unit variance
466
+
467
+
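A quick sketch of calling the multi-resolution noise helper above on a hypothetical batch of SD latents (4 channels, 64x64); the shapes are illustrative only:

import torch

device = torch.device("cpu")
noise = torch.randn(4, 4, 64, 64, device=device)  # stand-in for latent-shaped Gaussian noise
noise = pyramid_noise_like(noise, device, iterations=6, discount=0.4)
print(noise.std())  # rescaled back to roughly unit variance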
468
+ # https://www.crosslabs.org//blog/diffusion-with-offset-noise
469
+ def apply_noise_offset(latents, noise, noise_offset, adaptive_noise_scale):
470
+ if noise_offset is None:
471
+ return noise
472
+ if adaptive_noise_scale is not None:
473
+ # latent shape: (batch_size, channels, height, width)
474
+ # abs mean value for each channel
475
+ latent_mean = torch.abs(latents.mean(dim=(2, 3), keepdim=True))
476
+
477
+ # multiply adaptive noise scale to the mean value and add it to the noise offset
478
+ noise_offset = noise_offset + adaptive_noise_scale * latent_mean
479
+ noise_offset = torch.clamp(noise_offset, 0.0, None) # in case of adaptive noise scale is negative
480
+
481
+ noise = noise + noise_offset * torch.randn((latents.shape[0], latents.shape[1], 1, 1), device=latents.device)
482
+ return noise
483
+
484
+
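For the offset-noise helper above, the offset is a per-sample, per-channel shift; with adaptive_noise_scale the shift additionally grows with the mean absolute brightness of the latents. A small sketch under those assumptions (shapes are illustrative):

import torch

latents = torch.randn(4, 4, 64, 64)  # hypothetical batch of latents
noise = torch.randn_like(latents)

noisy = apply_noise_offset(latents, noise, noise_offset=0.1, adaptive_noise_scale=None)
noisy_adaptive = apply_noise_offset(latents, noise, noise_offset=0.1, adaptive_noise_scale=0.01)
print(noisy.shape, noisy_adaptive.shape)  # both keep the latent shape (4, 4, 64, 64)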
485
+ def apply_masked_loss(loss, batch):
486
+ if "conditioning_images" in batch:
487
+ # conditioning image is -1 to 1. we need to convert it to 0 to 1
488
+ mask_image = batch["conditioning_images"].to(dtype=loss.dtype)[:, 0].unsqueeze(1) # use R channel
489
+ mask_image = mask_image / 2 + 0.5
490
+ # print(f"conditioning_image: {mask_image.shape}")
491
+ elif "alpha_masks" in batch and batch["alpha_masks"] is not None:
492
+ # alpha mask is 0 to 1
493
+ mask_image = batch["alpha_masks"].to(dtype=loss.dtype).unsqueeze(1) # add channel dimension
494
+ # print(f"mask_image: {mask_image.shape}, {mask_image.mean()}")
495
+ else:
496
+ return loss
497
+
498
+ # resize to the same size as the loss
499
+ mask_image = torch.nn.functional.interpolate(mask_image, size=loss.shape[2:], mode="area")
500
+ loss = loss * mask_image
501
+ return loss
502
+
503
+
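A minimal sketch of the masked-loss path above, assuming an element-wise (un-reduced) latent-space loss and an "alpha_masks" entry in the batch; all names and shapes are illustrative:

import torch

loss = torch.rand(2, 4, 64, 64)                   # element-wise MSE loss in latent space
batch = {"alpha_masks": torch.ones(2, 512, 512)}  # full-resolution alpha masks in [0, 1]
masked = apply_masked_loss(loss, batch)
print(masked.shape)  # (2, 4, 64, 64); the mask was resized to the latent resolution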
504
+ """
505
+ ##########################################
506
+ # Perlin Noise
507
+ def rand_perlin_2d(device, shape, res, fade=lambda t: 6 * t**5 - 15 * t**4 + 10 * t**3):
508
+ delta = (res[0] / shape[0], res[1] / shape[1])
509
+ d = (shape[0] // res[0], shape[1] // res[1])
510
+
511
+ grid = (
512
+ torch.stack(
513
+ torch.meshgrid(torch.arange(0, res[0], delta[0], device=device), torch.arange(0, res[1], delta[1], device=device)),
514
+ dim=-1,
515
+ )
516
+ % 1
517
+ )
518
+ angles = 2 * torch.pi * torch.rand(res[0] + 1, res[1] + 1, device=device)
519
+ gradients = torch.stack((torch.cos(angles), torch.sin(angles)), dim=-1)
520
+
521
+ tile_grads = (
522
+ lambda slice1, slice2: gradients[slice1[0] : slice1[1], slice2[0] : slice2[1]]
523
+ .repeat_interleave(d[0], 0)
524
+ .repeat_interleave(d[1], 1)
525
+ )
526
+ dot = lambda grad, shift: (
527
+ torch.stack((grid[: shape[0], : shape[1], 0] + shift[0], grid[: shape[0], : shape[1], 1] + shift[1]), dim=-1)
528
+ * grad[: shape[0], : shape[1]]
529
+ ).sum(dim=-1)
530
+
531
+ n00 = dot(tile_grads([0, -1], [0, -1]), [0, 0])
532
+ n10 = dot(tile_grads([1, None], [0, -1]), [-1, 0])
533
+ n01 = dot(tile_grads([0, -1], [1, None]), [0, -1])
534
+ n11 = dot(tile_grads([1, None], [1, None]), [-1, -1])
535
+ t = fade(grid[: shape[0], : shape[1]])
536
+ return 1.414 * torch.lerp(torch.lerp(n00, n10, t[..., 0]), torch.lerp(n01, n11, t[..., 0]), t[..., 1])
537
+
538
+
539
+ def rand_perlin_2d_octaves(device, shape, res, octaves=1, persistence=0.5):
540
+ noise = torch.zeros(shape, device=device)
541
+ frequency = 1
542
+ amplitude = 1
543
+ for _ in range(octaves):
544
+ noise += amplitude * rand_perlin_2d(device, shape, (frequency * res[0], frequency * res[1]))
545
+ frequency *= 2
546
+ amplitude *= persistence
547
+ return noise
548
+
549
+
550
+ def perlin_noise(noise, device, octaves):
551
+ _, c, w, h = noise.shape
552
+ perlin = lambda: rand_perlin_2d_octaves(device, (w, h), (4, 4), octaves)
553
+ noise_perlin = []
554
+ for _ in range(c):
555
+ noise_perlin.append(perlin())
556
+ noise_perlin = torch.stack(noise_perlin).unsqueeze(0) # (1, c, w, h)
557
+ noise += noise_perlin # broadcast for each batch
558
+ return noise / noise.std() # Scaled back to roughly unit variance
559
+ """
deepspeed_utils.py ADDED
@@ -0,0 +1,139 @@
1
+ import os
2
+ import argparse
3
+ import torch
4
+ from accelerate import DeepSpeedPlugin, Accelerator
5
+
6
+ from .utils import setup_logging
7
+
8
+ setup_logging()
9
+ import logging
10
+
11
+ logger = logging.getLogger(__name__)
12
+
13
+
14
+ def add_deepspeed_arguments(parser: argparse.ArgumentParser):
15
+ # DeepSpeed Arguments. https://huggingface.co/docs/accelerate/usage_guides/deepspeed
16
+ parser.add_argument("--deepspeed", action="store_true", help="enable deepspeed training")
17
+ parser.add_argument("--zero_stage", type=int, default=2, choices=[0, 1, 2, 3], help="Possible options are 0,1,2,3.")
18
+ parser.add_argument(
19
+ "--offload_optimizer_device",
20
+ type=str,
21
+ default=None,
22
+ choices=[None, "cpu", "nvme"],
23
+ help="Possible options are none|cpu|nvme. Only applicable with ZeRO Stages 2 and 3.",
24
+ )
25
+ parser.add_argument(
26
+ "--offload_optimizer_nvme_path",
27
+ type=str,
28
+ default=None,
29
+ help="Possible options are /nvme|/local_nvme. Only applicable with ZeRO Stage 3.",
30
+ )
31
+ parser.add_argument(
32
+ "--offload_param_device",
33
+ type=str,
34
+ default=None,
35
+ choices=[None, "cpu", "nvme"],
36
+ help="Possible options are none|cpu|nvme. Only applicable with ZeRO Stage 3.",
37
+ )
38
+ parser.add_argument(
39
+ "--offload_param_nvme_path",
40
+ type=str,
41
+ default=None,
42
+ help="Possible options are /nvme|/local_nvme. Only applicable with ZeRO Stage 3.",
43
+ )
44
+ parser.add_argument(
45
+ "--zero3_init_flag",
46
+ action="store_true",
47
+ help="Flag to indicate whether to enable `deepspeed.zero.Init` for constructing massive models."
48
+ "Only applicable with ZeRO Stage-3.",
49
+ )
50
+ parser.add_argument(
51
+ "--zero3_save_16bit_model",
52
+ action="store_true",
53
+ help="Flag to indicate whether to save 16-bit model. Only applicable with ZeRO Stage-3.",
54
+ )
55
+ parser.add_argument(
56
+ "--fp16_master_weights_and_gradients",
57
+ action="store_true",
58
+ help="fp16_master_and_gradients requires optimizer to support keeping fp16 master and gradients while keeping the optimizer states in fp32.",
59
+ )
60
+
61
+
62
+ def prepare_deepspeed_args(args: argparse.Namespace):
63
+ if not args.deepspeed:
64
+ return
65
+
66
+ # To avoid RuntimeError: DataLoader worker exited unexpectedly with exit code 1.
67
+ args.max_data_loader_n_workers = 1
68
+
69
+
70
+ def prepare_deepspeed_plugin(args: argparse.Namespace):
71
+ if not args.deepspeed:
72
+ return None
73
+
74
+ try:
75
+ import deepspeed
76
+ except ImportError as e:
77
+ logger.error(
78
+ "deepspeed is not installed. please install deepspeed in your environment with following command. DS_BUILD_OPS=0 pip install deepspeed"
79
+ )
80
+ exit(1)
81
+
82
+ deepspeed_plugin = DeepSpeedPlugin(
83
+ zero_stage=args.zero_stage,
84
+ gradient_accumulation_steps=args.gradient_accumulation_steps,
85
+ gradient_clipping=args.max_grad_norm,
86
+ offload_optimizer_device=args.offload_optimizer_device,
87
+ offload_optimizer_nvme_path=args.offload_optimizer_nvme_path,
88
+ offload_param_device=args.offload_param_device,
89
+ offload_param_nvme_path=args.offload_param_nvme_path,
90
+ zero3_init_flag=args.zero3_init_flag,
91
+ zero3_save_16bit_model=args.zero3_save_16bit_model,
92
+ )
93
+ deepspeed_plugin.deepspeed_config["train_micro_batch_size_per_gpu"] = args.train_batch_size
94
+ deepspeed_plugin.deepspeed_config["train_batch_size"] = (
95
+ args.train_batch_size * args.gradient_accumulation_steps * int(os.environ["WORLD_SIZE"])
96
+ )
97
+ deepspeed_plugin.set_mixed_precision(args.mixed_precision)
98
+ if args.mixed_precision.lower() == "fp16":
99
+ deepspeed_plugin.deepspeed_config["fp16"]["initial_scale_power"] = 0 # preventing overflow.
100
+ if args.full_fp16 or args.fp16_master_weights_and_gradients:
101
+ if args.offload_optimizer_device == "cpu" and args.zero_stage == 2:
102
+ deepspeed_plugin.deepspeed_config["fp16"]["fp16_master_weights_and_grads"] = True
103
+ logger.info("[DeepSpeed] full fp16 enable.")
104
+ else:
105
+ logger.info(
106
+ "[DeepSpeed]full fp16, fp16_master_weights_and_grads currently only supported using ZeRO-Offload with DeepSpeedCPUAdam on ZeRO-2 stage."
107
+ )
108
+
109
+ if args.offload_optimizer_device is not None:
110
+ logger.info("[DeepSpeed] start to manually build cpu_adam.")
111
+ deepspeed.ops.op_builder.CPUAdamBuilder().load()
112
+ logger.info("[DeepSpeed] building cpu_adam done.")
113
+
114
+ return deepspeed_plugin
115
+
116
+
117
+ # Accelerate library does not support multiple models for deepspeed. So, we need to wrap multiple models into a single model.
118
+ def prepare_deepspeed_model(args: argparse.Namespace, **models):
119
+ # remove None from models
120
+ models = {k: v for k, v in models.items() if v is not None}
121
+
122
+ class DeepSpeedWrapper(torch.nn.Module):
123
+ def __init__(self, **kw_models) -> None:
124
+ super().__init__()
125
+ self.models = torch.nn.ModuleDict()
126
+
127
+ for key, model in kw_models.items():
128
+ if isinstance(model, list):
129
+ model = torch.nn.ModuleList(model)
130
+ assert isinstance(
131
+ model, torch.nn.Module
132
+ ), f"model must be an instance of torch.nn.Module, but got {key} is {type(model)}"
133
+ self.models.update(torch.nn.ModuleDict({key: model}))
134
+
135
+ def get_models(self):
136
+ return self.models
137
+
138
+ ds_model = DeepSpeedWrapper(**models)
139
+ return ds_model
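A small illustration of the wrapper above, with toy modules standing in for the networks that would normally be passed in (the args namespace here is a hypothetical stand-in and is not inspected by this function):

import argparse
import torch

args = argparse.Namespace(deepspeed=True)
unet = torch.nn.Linear(8, 8)          # toy stand-in for the U-Net
text_encoder = torch.nn.Linear(4, 4)  # toy stand-in for the text encoder

wrapped = prepare_deepspeed_model(args, unet=unet, text_encoder=text_encoder, vae=None)
print(list(wrapped.get_models().keys()))  # ['unet', 'text_encoder']; None entries are dropped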
dependabot.yml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ version: 2
3
+ updates:
4
+ - package-ecosystem: "github-actions"
5
+ directory: "/"
6
+ schedule:
7
+ interval: "monthly"
detect_face_rotate.py ADDED
@@ -0,0 +1,253 @@
1
+ # This script is licensed under Apache License 2.0, the same as train_dreambooth.py
2
+ # (c) 2022 Kohya S. @kohya_ss
3
+
4
+ # Detect faces in landscape images, rotate them so the face is upright, and crop a square centered on the face
5
+
6
+ # v2: extract max face if multiple faces are found
7
+ # v3: add crop_ratio option
8
+ # v4: add multiple faces extraction and min/max size
9
+
10
+ import argparse
11
+ import math
12
+ import cv2
13
+ import glob
14
+ import os
15
+ from anime_face_detector import create_detector
16
+ from tqdm import tqdm
17
+ import numpy as np
18
+ from library.utils import setup_logging, pil_resize
19
+ setup_logging()
20
+ import logging
21
+ logger = logging.getLogger(__name__)
22
+
23
+ KP_REYE = 11
24
+ KP_LEYE = 19
25
+
26
+ SCORE_THRES = 0.90
27
+
28
+
29
+ def detect_faces(detector, image, min_size):
30
+ preds = detector(image) # bgr
31
+ # logger.info(len(preds))
32
+
33
+ faces = []
34
+ for pred in preds:
35
+ bb = pred['bbox']
36
+ score = bb[-1]
37
+ if score < SCORE_THRES:
38
+ continue
39
+
40
+ left, top, right, bottom = bb[:4]
41
+ cx = int((left + right) / 2)
42
+ cy = int((top + bottom) / 2)
43
+ fw = int(right - left)
44
+ fh = int(bottom - top)
45
+
46
+ lex, ley = pred['keypoints'][KP_LEYE, 0:2]
47
+ rex, rey = pred['keypoints'][KP_REYE, 0:2]
48
+ angle = math.atan2(ley - rey, lex - rex)
49
+ angle = angle / math.pi * 180
50
+
51
+ faces.append((cx, cy, fw, fh, angle))
52
+
53
+ faces.sort(key=lambda x: max(x[2], x[3]), reverse=True) # largest faces first
54
+ return faces
55
+
56
+
57
+ def rotate_image(image, angle, cx, cy):
58
+ h, w = image.shape[0:2]
59
+ rot_mat = cv2.getRotationMatrix2D((cx, cy), angle, 1.0)
60
+
61
+ # # enlarge the image a little to allow for the rotation -> disabled for now
62
+ # nh = max(h, int(w * math.sin(angle)))
63
+ # nw = max(w, int(h * math.sin(angle)))
64
+ # if nh > h or nw > w:
65
+ # pad_y = nh - h
66
+ # pad_t = pad_y // 2
67
+ # pad_x = nw - w
68
+ # pad_l = pad_x // 2
69
+ # m = np.array([[0, 0, pad_l],
70
+ # [0, 0, pad_t]])
71
+ # rot_mat = rot_mat + m
72
+ # h, w = nh, nw
73
+ # cx += pad_l
74
+ # cy += pad_t
75
+
76
+ result = cv2.warpAffine(image, rot_mat, (w, h), flags=cv2.INTER_LINEAR, borderMode=cv2.BORDER_REFLECT)
77
+ return result, cx, cy
78
+
79
+
80
+ def process(args):
81
+ assert (not args.resize_fit) or args.resize_face_size is None, f"resize_fit and resize_face_size can't both be specified / resize_fitとresize_face_sizeはどちらか片方しか指定できません"
82
+ assert args.crop_ratio is None or args.resize_face_size is None, f"resize_face_size cannot be specified together with crop_ratio / crop_ratio指定時はresize_face_sizeは指定できません"
83
+
84
+ # load the anime face detection model
85
+ logger.info("loading face detector.")
86
+ detector = create_detector('yolov3')
87
+
88
+ # parse the crop arguments
89
+ if args.crop_size is None:
90
+ crop_width = crop_height = None
91
+ else:
92
+ tokens = args.crop_size.split(',')
93
+ assert len(tokens) == 2, f"crop_size must be 'width,height' / crop_sizeは'幅,高さ'で指定してください"
94
+ crop_width, crop_height = [int(t) for t in tokens]
95
+
96
+ if args.crop_ratio is None:
97
+ crop_h_ratio = crop_v_ratio = None
98
+ else:
99
+ tokens = args.crop_ratio.split(',')
100
+ assert len(tokens) == 2, f"crop_ratio must be 'horizontal,vertical' / crop_ratioは'幅,高さ'の倍率で指定してください"
101
+ crop_h_ratio, crop_v_ratio = [float(t) for t in tokens]
102
+
103
+ # process the images
104
+ logger.info("processing.")
105
+ output_extension = ".png"
106
+
107
+ os.makedirs(args.dst_dir, exist_ok=True)
108
+ paths = glob.glob(os.path.join(args.src_dir, "*.png")) + glob.glob(os.path.join(args.src_dir, "*.jpg")) + \
109
+ glob.glob(os.path.join(args.src_dir, "*.webp"))
110
+ for path in tqdm(paths):
111
+ basename = os.path.splitext(os.path.basename(path))[0]
112
+
113
+ # image = cv2.imread(path) # fails on Japanese (non-ASCII) file names
114
+ image = cv2.imdecode(np.fromfile(path, np.uint8), cv2.IMREAD_UNCHANGED)
115
+ if len(image.shape) == 2:
116
+ image = cv2.cvtColor(image, cv2.COLOR_GRAY2BGR)
117
+ if image.shape[2] == 4:
118
+ logger.warning(f"image has alpha. ignore / 画像の透明度が設定されているため無視します: {path}")
119
+ image = image[:, :, :3].copy() # without copy(), the alpha information apparently stays attached internally
120
+
121
+ h, w = image.shape[:2]
122
+
123
+ faces = detect_faces(detector, image, args.multiple_faces)
124
+ for i, face in enumerate(faces):
125
+ cx, cy, fw, fh, angle = face
126
+ face_size = max(fw, fh)
127
+ if args.min_size is not None and face_size < args.min_size:
128
+ continue
129
+ if args.max_size is not None and face_size >= args.max_size:
130
+ continue
131
+ face_suffix = f"_{i+1:02d}" if args.multiple_faces else ""
132
+
133
+ # rotate if the option is specified
134
+ face_img = image
135
+ if args.rotate:
136
+ face_img, cx, cy = rotate_image(face_img, angle, cx, cy)
137
+
138
+ # crop around the face if the option is specified
139
+ if crop_width is not None or crop_h_ratio is not None:
140
+ cur_crop_width, cur_crop_height = crop_width, crop_height
141
+ if crop_h_ratio is not None:
142
+ cur_crop_width = int(face_size * crop_h_ratio + .5)
143
+ cur_crop_height = int(face_size * crop_v_ratio + .5)
144
+
145
+ # resize if necessary
146
+ scale = 1.0
147
+ if args.resize_face_size is not None:
148
+ # resize based on the face size
149
+ scale = args.resize_face_size / face_size
150
+ if scale < cur_crop_width / w:
151
+ logger.warning(
152
+ f"image width too small in face size based resizing / 顔を基準にリサイズすると画像の幅がcrop sizeより小さい(顔が相対的に大きすぎる)ので顔サイズが変わります: {path}")
153
+ scale = cur_crop_width / w
154
+ if scale < cur_crop_height / h:
155
+ logger.warning(
156
+ f"image height too small in face size based resizing / 顔を基準にリサイズすると画像の高さがcrop sizeより小さい(顔が相対的に大きすぎる)ので顔サイズが変わります: {path}")
157
+ scale = cur_crop_height / h
158
+ elif crop_h_ratio is not None:
159
+ # do not resize when a ratio is specified
160
+ pass
161
+ else:
162
+ # crop size is specified explicitly
163
+ if w < cur_crop_width:
164
+ logger.warning(f"image width too small/ 画像の幅がcrop sizeより小さいので画質が劣化します: {path}")
165
+ scale = cur_crop_width / w
166
+ if h < cur_crop_height:
167
+ logger.warning(f"image height too small/ 画像の高さがcrop sizeより小さいので画質が劣化します: {path}")
168
+ scale = cur_crop_height / h
169
+ if args.resize_fit:
170
+ scale = max(cur_crop_width / w, cur_crop_height / h)
171
+
172
+ if scale != 1.0:
173
+ w = int(w * scale + .5)
174
+ h = int(h * scale + .5)
175
+ if scale < 1.0:
176
+ face_img = cv2.resize(face_img, (w, h), interpolation=cv2.INTER_AREA)
177
+ else:
178
+ face_img = pil_resize(face_img, (w, h))
179
+ cx = int(cx * scale + .5)
180
+ cy = int(cy * scale + .5)
181
+ fw = int(fw * scale + .5)
182
+ fh = int(fh * scale + .5)
183
+
184
+ cur_crop_width = min(cur_crop_width, face_img.shape[1])
185
+ cur_crop_height = min(cur_crop_height, face_img.shape[0])
186
+
187
+ x = cx - cur_crop_width // 2
188
+ cx = cur_crop_width // 2
189
+ if x < 0:
190
+ cx = cx + x
191
+ x = 0
192
+ elif x + cur_crop_width > w:
193
+ cx = cx + (x + cur_crop_width - w)
194
+ x = w - cur_crop_width
195
+ face_img = face_img[:, x:x+cur_crop_width]
196
+
197
+ y = cy - cur_crop_height // 2
198
+ cy = cur_crop_height // 2
199
+ if y < 0:
200
+ cy = cy + y
201
+ y = 0
202
+ elif y + cur_crop_height > h:
203
+ cy = cy + (y + cur_crop_height - h)
204
+ y = h - cur_crop_height
205
+ face_img = face_img[y:y + cur_crop_height]
206
+
207
+ # # debug
208
+ # logger.info(path, cx, cy, angle)
209
+ # crp = cv2.resize(image, (image.shape[1]//8, image.shape[0]//8))
210
+ # cv2.imshow("image", crp)
211
+ # if cv2.waitKey() == 27:
212
+ # break
213
+ # cv2.destroyAllWindows()
214
+
215
+ # debug
216
+ if args.debug:
217
+ cv2.rectangle(face_img, (cx-fw//2, cy-fh//2), (cx+fw//2, cy+fh//2), (255, 0, 255), fw//20)
218
+
219
+ _, buf = cv2.imencode(output_extension, face_img)
220
+ with open(os.path.join(args.dst_dir, f"{basename}{face_suffix}_{cx:04d}_{cy:04d}_{fw:04d}_{fh:04d}{output_extension}"), "wb") as f:
221
+ buf.tofile(f)
222
+
223
+
224
+ def setup_parser() -> argparse.ArgumentParser:
225
+ parser = argparse.ArgumentParser()
226
+ parser.add_argument("--src_dir", type=str, help="directory to load images / 画像を読み込むディレクトリ")
227
+ parser.add_argument("--dst_dir", type=str, help="directory to save images / 画像を保存するディレクトリ")
228
+ parser.add_argument("--rotate", action="store_true", help="rotate images to align faces / 顔が正立するように画像を回転する")
229
+ parser.add_argument("--resize_fit", action="store_true",
230
+ help="resize to fit smaller side after cropping / 切り出し後の画像の短辺がcrop_sizeにあうようにリサイズする")
231
+ parser.add_argument("--resize_face_size", type=int, default=None,
232
+ help="resize image before cropping by face size / 切り出し前に顔がこのサイズになるようにリサイズする")
233
+ parser.add_argument("--crop_size", type=str, default=None,
234
+ help="crop images with 'width,height' pixels, face centered / 顔を中心として'幅,高さ'のサイズで切り出す")
235
+ parser.add_argument("--crop_ratio", type=str, default=None,
236
+ help="crop images with 'horizontal,vertical' ratio to face, face centered / 顔を中心として顔サイズの'幅倍率,高さ倍率'のサイズで切り出す")
237
+ parser.add_argument("--min_size", type=int, default=None,
238
+ help="minimum face size to output (included) / 処理対象とする顔の最小サイズ(この値以上)")
239
+ parser.add_argument("--max_size", type=int, default=None,
240
+ help="maximum face size to output (excluded) / 処理対象とする顔の最大サイズ(この値未満)")
241
+ parser.add_argument("--multiple_faces", action="store_true",
242
+ help="output each faces / 複数の顔が見つかった場合、それぞれを切り出す")
243
+ parser.add_argument("--debug", action="store_true", help="render rect for face / 処理後画像の顔位置に矩形を描画します")
244
+
245
+ return parser
246
+
247
+
248
+ if __name__ == '__main__':
249
+ parser = setup_parser()
250
+
251
+ args = parser.parse_args()
252
+
253
+ process(args)
device_utils.py ADDED
@@ -0,0 +1,89 @@
1
+ import functools
2
+ import gc
3
+
4
+ import torch
5
+ try:
6
+ # intel gpu support for pytorch older than 2.5
7
+ # ipex is not needed after pytorch 2.5
8
+ import intel_extension_for_pytorch as ipex # noqa
9
+ except Exception:
10
+ pass
11
+
12
+
13
+ try:
14
+ HAS_CUDA = torch.cuda.is_available()
15
+ except Exception:
16
+ HAS_CUDA = False
17
+
18
+ try:
19
+ HAS_MPS = torch.backends.mps.is_available()
20
+ except Exception:
21
+ HAS_MPS = False
22
+
23
+ try:
24
+ HAS_XPU = torch.xpu.is_available()
25
+ except Exception:
26
+ HAS_XPU = False
27
+
28
+
29
+ def clean_memory():
30
+ gc.collect()
31
+ if HAS_CUDA:
32
+ torch.cuda.empty_cache()
33
+ if HAS_XPU:
34
+ torch.xpu.empty_cache()
35
+ if HAS_MPS:
36
+ torch.mps.empty_cache()
37
+
38
+
39
+ def clean_memory_on_device(device: torch.device):
40
+ r"""
41
+ Clean memory on the specified device, will be called from training scripts.
42
+ """
43
+ gc.collect()
44
+
45
+ # device may "cuda" or "cuda:0", so we need to check the type of device
46
+ if device.type == "cuda":
47
+ torch.cuda.empty_cache()
48
+ if device.type == "xpu":
49
+ torch.xpu.empty_cache()
50
+ if device.type == "mps":
51
+ torch.mps.empty_cache()
52
+
53
+
54
+ @functools.lru_cache(maxsize=None)
55
+ def get_preferred_device() -> torch.device:
56
+ r"""
57
+ Do not call this function from training scripts. Use accelerator.device instead.
58
+ """
59
+ if HAS_CUDA:
60
+ device = torch.device("cuda")
61
+ elif HAS_XPU:
62
+ device = torch.device("xpu")
63
+ elif HAS_MPS:
64
+ device = torch.device("mps")
65
+ else:
66
+ device = torch.device("cpu")
67
+ print(f"get_preferred_device() -> {device}")
68
+ return device
69
+
70
+
71
+ def init_ipex():
72
+ """
73
+ Apply IPEX to CUDA hijacks using `library.ipex.ipex_init`.
74
+
75
+ This function should run right after importing torch and before doing anything else.
76
+
77
+ If xpu is not available, this function does nothing.
78
+ """
79
+ try:
80
+ if HAS_XPU:
81
+ from library.ipex import ipex_init
82
+
83
+ is_initialized, error_message = ipex_init()
84
+ if not is_initialized:
85
+ print("failed to initialize ipex:", error_message)
86
+ else:
87
+ return
88
+ except Exception as e:
89
+ print("failed to initialize ipex:", e)
diffusers.py ADDED
@@ -0,0 +1,47 @@
1
+ from functools import wraps
2
+ import torch
3
+ import diffusers # pylint: disable=import-error
4
+
5
+ # pylint: disable=protected-access, missing-function-docstring, line-too-long
6
+
7
+
8
+ # Diffusers FreeU
9
+ original_fourier_filter = diffusers.utils.torch_utils.fourier_filter
10
+ @wraps(diffusers.utils.torch_utils.fourier_filter)
11
+ def fourier_filter(x_in, threshold, scale):
12
+ return_dtype = x_in.dtype
13
+ return original_fourier_filter(x_in.to(dtype=torch.float32), threshold, scale).to(dtype=return_dtype)
14
+
15
+
16
+ # workaround for fp64 errors: float32 rotary position embedding for devices without float64 support
17
+ class FluxPosEmbed(torch.nn.Module):
18
+ def __init__(self, theta: int, axes_dim):
19
+ super().__init__()
20
+ self.theta = theta
21
+ self.axes_dim = axes_dim
22
+
23
+ def forward(self, ids: torch.Tensor) -> torch.Tensor:
24
+ n_axes = ids.shape[-1]
25
+ cos_out = []
26
+ sin_out = []
27
+ pos = ids.float()
28
+ for i in range(n_axes):
29
+ cos, sin = diffusers.models.embeddings.get_1d_rotary_pos_embed(
30
+ self.axes_dim[i],
31
+ pos[:, i],
32
+ theta=self.theta,
33
+ repeat_interleave_real=True,
34
+ use_real=True,
35
+ freqs_dtype=torch.float32,
36
+ )
37
+ cos_out.append(cos)
38
+ sin_out.append(sin)
39
+ freqs_cos = torch.cat(cos_out, dim=-1).to(ids.device)
40
+ freqs_sin = torch.cat(sin_out, dim=-1).to(ids.device)
41
+ return freqs_cos, freqs_sin
42
+
43
+
44
+ def ipex_diffusers(device_supports_fp64=False, can_allocate_plus_4gb=False):
45
+ diffusers.utils.torch_utils.fourier_filter = fourier_filter
46
+ if not device_supports_fp64:
47
+ diffusers.models.embeddings.FluxPosEmbed = FluxPosEmbed
dylora.py ADDED
@@ -0,0 +1,529 @@
1
+ # some codes are copied from:
2
+ # https://github.com/huawei-noah/KD-NLP/blob/main/DyLoRA/
3
+
4
+ # Copyright (C) 2022. Huawei Technologies Co., Ltd. All rights reserved.
5
+ # Changes made to the original code:
6
+ # 2022.08.20 - Integrate the DyLoRA layer for the LoRA Linear layer
7
+ # ------------------------------------------------------------------------------------------
8
+ # Copyright (c) Microsoft Corporation. All rights reserved.
9
+ # Licensed under the MIT License (MIT). See LICENSE in the repo root for license information.
10
+ # ------------------------------------------------------------------------------------------
11
+
12
+ import math
13
+ import os
14
+ import random
15
+ from typing import Dict, List, Optional, Tuple, Type, Union
16
+ from diffusers import AutoencoderKL
17
+ from transformers import CLIPTextModel
18
+ import torch
19
+ from torch import nn
20
+ from library.utils import setup_logging
21
+
22
+ setup_logging()
23
+ import logging
24
+
25
+ logger = logging.getLogger(__name__)
26
+
27
+
28
+ class DyLoRAModule(torch.nn.Module):
29
+ """
30
+ replaces forward method of the original Linear, instead of replacing the original Linear module.
31
+ """
32
+
33
+ # NOTE: support dropout in future
34
+ def __init__(self, lora_name, org_module: torch.nn.Module, multiplier=1.0, lora_dim=4, alpha=1, unit=1):
35
+ super().__init__()
36
+ self.lora_name = lora_name
37
+ self.lora_dim = lora_dim
38
+ self.unit = unit
39
+ assert self.lora_dim % self.unit == 0, "rank must be a multiple of unit"
40
+
41
+ if org_module.__class__.__name__ == "Conv2d":
42
+ in_dim = org_module.in_channels
43
+ out_dim = org_module.out_channels
44
+ else:
45
+ in_dim = org_module.in_features
46
+ out_dim = org_module.out_features
47
+
48
+ if type(alpha) == torch.Tensor:
49
+ alpha = alpha.detach().float().numpy() # without casting, bf16 causes error
50
+ alpha = self.lora_dim if alpha is None or alpha == 0 else alpha
51
+ self.scale = alpha / self.lora_dim
52
+ self.register_buffer("alpha", torch.tensor(alpha)) # 定数として扱える
53
+
54
+ self.is_conv2d = org_module.__class__.__name__ == "Conv2d"
55
+ self.is_conv2d_3x3 = self.is_conv2d and org_module.kernel_size == (3, 3)
56
+
57
+ if self.is_conv2d and self.is_conv2d_3x3:
58
+ kernel_size = org_module.kernel_size
59
+ self.stride = org_module.stride
60
+ self.padding = org_module.padding
61
+ self.lora_A = nn.ParameterList([org_module.weight.new_zeros((1, in_dim, *kernel_size)) for _ in range(self.lora_dim)])
62
+ self.lora_B = nn.ParameterList([org_module.weight.new_zeros((out_dim, 1, 1, 1)) for _ in range(self.lora_dim)])
63
+ else:
64
+ self.lora_A = nn.ParameterList([org_module.weight.new_zeros((1, in_dim)) for _ in range(self.lora_dim)])
65
+ self.lora_B = nn.ParameterList([org_module.weight.new_zeros((out_dim, 1)) for _ in range(self.lora_dim)])
66
+
67
+ # same as microsoft's
68
+ for lora in self.lora_A:
69
+ torch.nn.init.kaiming_uniform_(lora, a=math.sqrt(5))
70
+ for lora in self.lora_B:
71
+ torch.nn.init.zeros_(lora)
72
+
73
+ self.multiplier = multiplier
74
+ self.org_module = org_module # remove in applying
75
+
76
+ def apply_to(self):
77
+ self.org_forward = self.org_module.forward
78
+ self.org_module.forward = self.forward
79
+ del self.org_module
80
+
81
+ def forward(self, x):
82
+ result = self.org_forward(x)
83
+
84
+ # specify the dynamic rank
85
+ trainable_rank = random.randint(0, self.lora_dim - 1)
86
+ trainable_rank = trainable_rank - trainable_rank % self.unit # make sure the rank is a multiple of unit
87
+
88
+ # freeze some of the parameters and train the rest
89
+ for i in range(0, trainable_rank):
90
+ self.lora_A[i].requires_grad = False
91
+ self.lora_B[i].requires_grad = False
92
+ for i in range(trainable_rank, trainable_rank + self.unit):
93
+ self.lora_A[i].requires_grad = True
94
+ self.lora_B[i].requires_grad = True
95
+ for i in range(trainable_rank + self.unit, self.lora_dim):
96
+ self.lora_A[i].requires_grad = False
97
+ self.lora_B[i].requires_grad = False
98
+
99
+ lora_A = torch.cat(tuple(self.lora_A), dim=0)
100
+ lora_B = torch.cat(tuple(self.lora_B), dim=1)
101
+
102
+ # calculate with lora_A and lora_B
103
+ if self.is_conv2d_3x3:
104
+ ab = torch.nn.functional.conv2d(x, lora_A, stride=self.stride, padding=self.padding)
105
+ ab = torch.nn.functional.conv2d(ab, lora_B)
106
+ else:
107
+ ab = x
108
+ if self.is_conv2d:
109
+ ab = ab.reshape(ab.size(0), ab.size(1), -1).transpose(1, 2) # (N, C, H, W) -> (N, H*W, C)
110
+
111
+ ab = torch.nn.functional.linear(ab, lora_A)
112
+ ab = torch.nn.functional.linear(ab, lora_B)
113
+
114
+ if self.is_conv2d:
115
+ ab = ab.transpose(1, 2).reshape(ab.size(0), -1, *x.size()[2:]) # (N, H*W, C) -> (N, C, H, W)
116
+
117
+ # the last factor scales lower ranks up (probably)
118
+ result = result + ab * self.scale * math.sqrt(self.lora_dim / (trainable_rank + self.unit))
119
+
120
+ # NOTE it might be faster to add to the weight first and then call linear/conv2d
121
+ return result
122
+
123
+ def state_dict(self, destination=None, prefix="", keep_vars=False):
124
+ # make the state dict the same as a regular LoRA:
125
+ # nn.ParameterList entries get names like .lora_A.0, so cat them (as in forward) and swap them in
126
+ sd = super().state_dict(destination=destination, prefix=prefix, keep_vars=keep_vars)
127
+
128
+ lora_A_weight = torch.cat(tuple(self.lora_A), dim=0)
129
+ if self.is_conv2d and not self.is_conv2d_3x3:
130
+ lora_A_weight = lora_A_weight.unsqueeze(-1).unsqueeze(-1)
131
+
132
+ lora_B_weight = torch.cat(tuple(self.lora_B), dim=1)
133
+ if self.is_conv2d and not self.is_conv2d_3x3:
134
+ lora_B_weight = lora_B_weight.unsqueeze(-1).unsqueeze(-1)
135
+
136
+ sd[self.lora_name + ".lora_down.weight"] = lora_A_weight if keep_vars else lora_A_weight.detach()
137
+ sd[self.lora_name + ".lora_up.weight"] = lora_B_weight if keep_vars else lora_B_weight.detach()
138
+
139
+ i = 0
140
+ while True:
141
+ key_a = f"{self.lora_name}.lora_A.{i}"
142
+ key_b = f"{self.lora_name}.lora_B.{i}"
143
+ if key_a in sd:
144
+ sd.pop(key_a)
145
+ sd.pop(key_b)
146
+ else:
147
+ break
148
+ i += 1
149
+ return sd
150
+
151
+ def _load_from_state_dict(self, state_dict, prefix, local_metadata, strict, missing_keys, unexpected_keys, error_msgs):
152
+ # allow loading the same state dict as a regular LoRA (this approach was suggested by ChatGPT)
153
+ lora_A_weight = state_dict.pop(self.lora_name + ".lora_down.weight", None)
154
+ lora_B_weight = state_dict.pop(self.lora_name + ".lora_up.weight", None)
155
+
156
+ if lora_A_weight is None or lora_B_weight is None:
157
+ if strict:
158
+ raise KeyError(f"{self.lora_name}.lora_down/up.weight is not found")
159
+ else:
160
+ return
161
+
162
+ if self.is_conv2d and not self.is_conv2d_3x3:
163
+ lora_A_weight = lora_A_weight.squeeze(-1).squeeze(-1)
164
+ lora_B_weight = lora_B_weight.squeeze(-1).squeeze(-1)
165
+
166
+ state_dict.update(
167
+ {f"{self.lora_name}.lora_A.{i}": nn.Parameter(lora_A_weight[i].unsqueeze(0)) for i in range(lora_A_weight.size(0))}
168
+ )
169
+ state_dict.update(
170
+ {f"{self.lora_name}.lora_B.{i}": nn.Parameter(lora_B_weight[:, i].unsqueeze(1)) for i in range(lora_B_weight.size(1))}
171
+ )
172
+
173
+ super()._load_from_state_dict(state_dict, prefix, local_metadata, strict, missing_keys, unexpected_keys, error_msgs)
174
+
175
+
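A minimal usage sketch (illustrative names only): attach a DyLoRAModule to a plain Linear layer and run one forward pass. Since lora_B is zero-initialized, the output equals the original Linear output at this point.

import torch

linear = torch.nn.Linear(16, 32)
dylora = DyLoRAModule("lora_example", linear, multiplier=1.0, lora_dim=4, alpha=4, unit=1)
dylora.apply_to()  # swaps in the DyLoRA forward and drops the reference to the original module

x = torch.randn(2, 16)
out = linear(x)    # original output + (zero-initialized) DyLoRA delta at a random rank
print(out.shape)   # torch.Size([2, 32])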
176
+ def create_network(
177
+ multiplier: float,
178
+ network_dim: Optional[int],
179
+ network_alpha: Optional[float],
180
+ vae: AutoencoderKL,
181
+ text_encoder: Union[CLIPTextModel, List[CLIPTextModel]],
182
+ unet,
183
+ **kwargs,
184
+ ):
185
+ if network_dim is None:
186
+ network_dim = 4 # default
187
+ if network_alpha is None:
188
+ network_alpha = 1.0
189
+
190
+ # extract dim/alpha for conv2d, and block dim
191
+ conv_dim = kwargs.get("conv_dim", None)
192
+ conv_alpha = kwargs.get("conv_alpha", None)
193
+ unit = kwargs.get("unit", None)
194
+ if conv_dim is not None:
195
+ conv_dim = int(conv_dim)
196
+ assert conv_dim == network_dim, "conv_dim must be same as network_dim"
197
+ if conv_alpha is None:
198
+ conv_alpha = 1.0
199
+ else:
200
+ conv_alpha = float(conv_alpha)
201
+
202
+ if unit is not None:
203
+ unit = int(unit)
204
+ else:
205
+ unit = 1
206
+
207
+ network = DyLoRANetwork(
208
+ text_encoder,
209
+ unet,
210
+ multiplier=multiplier,
211
+ lora_dim=network_dim,
212
+ alpha=network_alpha,
213
+ apply_to_conv=conv_dim is not None,
214
+ unit=unit,
215
+ varbose=True,
216
+ )
217
+
218
+ loraplus_lr_ratio = kwargs.get("loraplus_lr_ratio", None)
219
+ loraplus_unet_lr_ratio = kwargs.get("loraplus_unet_lr_ratio", None)
220
+ loraplus_text_encoder_lr_ratio = kwargs.get("loraplus_text_encoder_lr_ratio", None)
221
+ loraplus_lr_ratio = float(loraplus_lr_ratio) if loraplus_lr_ratio is not None else None
222
+ loraplus_unet_lr_ratio = float(loraplus_unet_lr_ratio) if loraplus_unet_lr_ratio is not None else None
223
+ loraplus_text_encoder_lr_ratio = float(loraplus_text_encoder_lr_ratio) if loraplus_text_encoder_lr_ratio is not None else None
224
+ if loraplus_lr_ratio is not None or loraplus_unet_lr_ratio is not None or loraplus_text_encoder_lr_ratio is not None:
225
+ network.set_loraplus_lr_ratio(loraplus_lr_ratio, loraplus_unet_lr_ratio, loraplus_text_encoder_lr_ratio)
226
+
227
+ return network
228
+
229
+
230
+ # Create network from weights for inference, weights are not loaded here (because can be merged)
231
+ def create_network_from_weights(multiplier, file, vae, text_encoder, unet, weights_sd=None, for_inference=False, **kwargs):
232
+ if weights_sd is None:
233
+ if os.path.splitext(file)[1] == ".safetensors":
234
+ from safetensors.torch import load_file, safe_open
235
+
236
+ weights_sd = load_file(file)
237
+ else:
238
+ weights_sd = torch.load(file, map_location="cpu")
239
+
240
+ # get dim/alpha mapping
241
+ modules_dim = {}
242
+ modules_alpha = {}
243
+ for key, value in weights_sd.items():
244
+ if "." not in key:
245
+ continue
246
+
247
+ lora_name = key.split(".")[0]
248
+ if "alpha" in key:
249
+ modules_alpha[lora_name] = value
250
+ elif "lora_down" in key:
251
+ dim = value.size()[0]
252
+ modules_dim[lora_name] = dim
253
+ # logger.info(f"{lora_name} {value.size()} {dim}")
254
+
255
+ # support old LoRA without alpha
256
+ for key in modules_dim.keys():
257
+ if key not in modules_alpha:
258
+ modules_alpha[key] = modules_dim[key]
259
+
260
+ module_class = DyLoRAModule
261
+
262
+ network = DyLoRANetwork(
263
+ text_encoder, unet, multiplier=multiplier, modules_dim=modules_dim, modules_alpha=modules_alpha, module_class=module_class
264
+ )
265
+ return network, weights_sd
266
+
267
+
268
+ class DyLoRANetwork(torch.nn.Module):
269
+ UNET_TARGET_REPLACE_MODULE = ["Transformer2DModel"]
270
+ UNET_TARGET_REPLACE_MODULE_CONV2D_3X3 = ["ResnetBlock2D", "Downsample2D", "Upsample2D"]
271
+ TEXT_ENCODER_TARGET_REPLACE_MODULE = ["CLIPAttention", "CLIPSdpaAttention", "CLIPMLP"]
272
+ LORA_PREFIX_UNET = "lora_unet"
273
+ LORA_PREFIX_TEXT_ENCODER = "lora_te"
274
+
275
+ def __init__(
276
+ self,
277
+ text_encoder,
278
+ unet,
279
+ multiplier=1.0,
280
+ lora_dim=4,
281
+ alpha=1,
282
+ apply_to_conv=False,
283
+ modules_dim=None,
284
+ modules_alpha=None,
285
+ unit=1,
286
+ module_class=DyLoRAModule,
287
+ varbose=False,
288
+ ) -> None:
289
+ super().__init__()
290
+ self.multiplier = multiplier
291
+
292
+ self.lora_dim = lora_dim
293
+ self.alpha = alpha
294
+ self.apply_to_conv = apply_to_conv
295
+
296
+ self.loraplus_lr_ratio = None
297
+ self.loraplus_unet_lr_ratio = None
298
+ self.loraplus_text_encoder_lr_ratio = None
299
+
300
+ if modules_dim is not None:
301
+ logger.info("create LoRA network from weights")
302
+ else:
303
+ logger.info(f"create LoRA network. base dim (rank): {lora_dim}, alpha: {alpha}, unit: {unit}")
304
+ if self.apply_to_conv:
305
+ logger.info("apply LoRA to Conv2d with kernel size (3,3).")
306
+
307
+ # create module instances
308
+ def create_modules(is_unet, root_module: torch.nn.Module, target_replace_modules) -> List[DyLoRAModule]:
309
+ prefix = DyLoRANetwork.LORA_PREFIX_UNET if is_unet else DyLoRANetwork.LORA_PREFIX_TEXT_ENCODER
310
+ loras = []
311
+ for name, module in root_module.named_modules():
312
+ if module.__class__.__name__ in target_replace_modules:
313
+ for child_name, child_module in module.named_modules():
314
+ is_linear = child_module.__class__.__name__ == "Linear"
315
+ is_conv2d = child_module.__class__.__name__ == "Conv2d"
316
+ is_conv2d_1x1 = is_conv2d and child_module.kernel_size == (1, 1)
317
+
318
+ if is_linear or is_conv2d:
319
+ lora_name = prefix + "." + name + "." + child_name
320
+ lora_name = lora_name.replace(".", "_")
321
+
322
+ dim = None
323
+ alpha = None
324
+ if modules_dim is not None:
325
+ if lora_name in modules_dim:
326
+ dim = modules_dim[lora_name]
327
+ alpha = modules_alpha[lora_name]
328
+ else:
329
+ if is_linear or is_conv2d_1x1 or apply_to_conv:
330
+ dim = self.lora_dim
331
+ alpha = self.alpha
332
+
333
+ if dim is None or dim == 0:
334
+ continue
335
+
336
+ # dropout and fan_in_fan_out is default
337
+ lora = module_class(lora_name, child_module, self.multiplier, dim, alpha, unit)
338
+ loras.append(lora)
339
+ return loras
340
+
341
+ text_encoders = text_encoder if type(text_encoder) == list else [text_encoder]
342
+
343
+ self.text_encoder_loras = []
344
+ for i, text_encoder in enumerate(text_encoders):
345
+ if len(text_encoders) > 1:
346
+ index = i + 1
347
+ logger.info(f"create LoRA for Text Encoder {index}")
348
+ else:
349
+ index = None
350
+ logger.info("create LoRA for Text Encoder")
351
+
352
+ text_encoder_loras = create_modules(False, text_encoder, DyLoRANetwork.TEXT_ENCODER_TARGET_REPLACE_MODULE)
353
+ self.text_encoder_loras.extend(text_encoder_loras)
354
+
355
+ # self.text_encoder_loras = create_modules(False, text_encoder, DyLoRANetwork.TEXT_ENCODER_TARGET_REPLACE_MODULE)
356
+ logger.info(f"create LoRA for Text Encoder: {len(self.text_encoder_loras)} modules.")
357
+
358
+ # extend U-Net target modules if conv2d 3x3 is enabled, or load from weights
359
+ target_modules = DyLoRANetwork.UNET_TARGET_REPLACE_MODULE
360
+ if modules_dim is not None or self.apply_to_conv:
361
+ target_modules += DyLoRANetwork.UNET_TARGET_REPLACE_MODULE_CONV2D_3X3
362
+
363
+ self.unet_loras = create_modules(True, unet, target_modules)
364
+ logger.info(f"create LoRA for U-Net: {len(self.unet_loras)} modules.")
365
+
366
+ def set_loraplus_lr_ratio(self, loraplus_lr_ratio, loraplus_unet_lr_ratio, loraplus_text_encoder_lr_ratio):
367
+ self.loraplus_lr_ratio = loraplus_lr_ratio
368
+ self.loraplus_unet_lr_ratio = loraplus_unet_lr_ratio
369
+ self.loraplus_text_encoder_lr_ratio = loraplus_text_encoder_lr_ratio
370
+
371
+ logger.info(f"LoRA+ UNet LR Ratio: {self.loraplus_unet_lr_ratio or self.loraplus_lr_ratio}")
372
+ logger.info(f"LoRA+ Text Encoder LR Ratio: {self.loraplus_text_encoder_lr_ratio or self.loraplus_lr_ratio}")
373
+
374
+ def set_multiplier(self, multiplier):
375
+ self.multiplier = multiplier
376
+ for lora in self.text_encoder_loras + self.unet_loras:
377
+ lora.multiplier = self.multiplier
378
+
379
+ def load_weights(self, file):
380
+ if os.path.splitext(file)[1] == ".safetensors":
381
+ from safetensors.torch import load_file
382
+
383
+ weights_sd = load_file(file)
384
+ else:
385
+ weights_sd = torch.load(file, map_location="cpu")
386
+
387
+ info = self.load_state_dict(weights_sd, False)
388
+ return info
389
+
390
+ def apply_to(self, text_encoder, unet, apply_text_encoder=True, apply_unet=True):
391
+ if apply_text_encoder:
392
+ logger.info("enable LoRA for text encoder")
393
+ else:
394
+ self.text_encoder_loras = []
395
+
396
+ if apply_unet:
397
+ logger.info("enable LoRA for U-Net")
398
+ else:
399
+ self.unet_loras = []
400
+
401
+ for lora in self.text_encoder_loras + self.unet_loras:
402
+ lora.apply_to()
403
+ self.add_module(lora.lora_name, lora)
404
+
405
+ """
406
+ def merge_to(self, text_encoder, unet, weights_sd, dtype, device):
407
+ apply_text_encoder = apply_unet = False
408
+ for key in weights_sd.keys():
409
+ if key.startswith(DyLoRANetwork.LORA_PREFIX_TEXT_ENCODER):
410
+ apply_text_encoder = True
411
+ elif key.startswith(DyLoRANetwork.LORA_PREFIX_UNET):
412
+ apply_unet = True
413
+
414
+ if apply_text_encoder:
415
+ logger.info("enable LoRA for text encoder")
416
+ else:
417
+ self.text_encoder_loras = []
418
+
419
+ if apply_unet:
420
+ logger.info("enable LoRA for U-Net")
421
+ else:
422
+ self.unet_loras = []
423
+
424
+ for lora in self.text_encoder_loras + self.unet_loras:
425
+ sd_for_lora = {}
426
+ for key in weights_sd.keys():
427
+ if key.startswith(lora.lora_name):
428
+ sd_for_lora[key[len(lora.lora_name) + 1 :]] = weights_sd[key]
429
+ lora.merge_to(sd_for_lora, dtype, device)
430
+
431
+ logger.info(f"weights are merged")
432
+ """
433
+
434
+ # it might be good to allow separate learning rates for the two Text Encoders
435
+ def prepare_optimizer_params(self, text_encoder_lr, unet_lr, default_lr):
436
+ self.requires_grad_(True)
437
+ all_params = []
438
+
439
+ def assemble_params(loras, lr, ratio):
440
+ param_groups = {"lora": {}, "plus": {}}
441
+ for lora in loras:
442
+ for name, param in lora.named_parameters():
443
+ if ratio is not None and "lora_B" in name:
444
+ param_groups["plus"][f"{lora.lora_name}.{name}"] = param
445
+ else:
446
+ param_groups["lora"][f"{lora.lora_name}.{name}"] = param
447
+
448
+ params = []
449
+ for key in param_groups.keys():
450
+ param_data = {"params": param_groups[key].values()}
451
+
452
+ if len(param_data["params"]) == 0:
453
+ continue
454
+
455
+ if lr is not None:
456
+ if key == "plus":
457
+ param_data["lr"] = lr * ratio
458
+ else:
459
+ param_data["lr"] = lr
460
+
461
+ if param_data.get("lr", None) == 0 or param_data.get("lr", None) is None:
462
+ continue
463
+
464
+ params.append(param_data)
465
+
466
+ return params
467
+
468
+ if self.text_encoder_loras:
469
+ params = assemble_params(
470
+ self.text_encoder_loras,
471
+ text_encoder_lr if text_encoder_lr is not None else default_lr,
472
+ self.loraplus_text_encoder_lr_ratio or self.loraplus_lr_ratio,
473
+ )
474
+ all_params.extend(params)
475
+
476
+ if self.unet_loras:
477
+ params = assemble_params(
478
+ self.unet_loras, default_lr if unet_lr is None else unet_lr, self.loraplus_unet_lr_ratio or self.loraplus_lr_ratio
479
+ )
480
+ all_params.extend(params)
481
+
482
+ return all_params
483
+
484
+ def enable_gradient_checkpointing(self):
485
+ # not supported
486
+ pass
487
+
488
+ def prepare_grad_etc(self, text_encoder, unet):
489
+ self.requires_grad_(True)
490
+
491
+ def on_epoch_start(self, text_encoder, unet):
492
+ self.train()
493
+
494
+ def get_trainable_params(self):
495
+ return self.parameters()
496
+
497
+ def save_weights(self, file, dtype, metadata):
498
+ if metadata is not None and len(metadata) == 0:
499
+ metadata = None
500
+
501
+ state_dict = self.state_dict()
502
+
503
+ if dtype is not None:
504
+ for key in list(state_dict.keys()):
505
+ v = state_dict[key]
506
+ v = v.detach().clone().to("cpu").to(dtype)
507
+ state_dict[key] = v
508
+
509
+ if os.path.splitext(file)[1] == ".safetensors":
510
+ from safetensors.torch import save_file
511
+ from library import train_util
512
+
513
+ # Precalculate model hashes to save time on indexing
514
+ if metadata is None:
515
+ metadata = {}
516
+ model_hash, legacy_hash = train_util.precalculate_safetensors_hashes(state_dict, metadata)
517
+ metadata["sshs_model_hash"] = model_hash
518
+ metadata["sshs_legacy_hash"] = legacy_hash
519
+
520
+ save_file(state_dict, file, metadata)
521
+ else:
522
+ torch.save(state_dict, file)
523
+
524
+ # mask is a tensor with values from 0 to 1
525
+ def set_region(self, sub_prompt_index, is_last_network, mask):
526
+ pass
527
+
528
+ def set_current_generation(self, batch_size, num_sub_prompts, width, height, shared):
529
+ pass
extract_lora_from_dylora.py ADDED
@@ -0,0 +1,128 @@
1
+ # Convert LoRA to different rank approximation (should only be used to go to lower rank)
2
+ # This code is based off the extract_lora_from_models.py file which is based on https://github.com/cloneofsimo/lora/blob/develop/lora_diffusion/cli_svd.py
3
+ # Thanks to cloneofsimo
4
+
5
+ import argparse
6
+ import math
7
+ import os
8
+ import torch
9
+ from safetensors.torch import load_file, save_file, safe_open
10
+ from tqdm import tqdm
11
+ from library import train_util, model_util
12
+ import numpy as np
13
+ from library.utils import setup_logging
14
+ setup_logging()
15
+ import logging
16
+ logger = logging.getLogger(__name__)
17
+
18
+ def load_state_dict(file_name):
19
+ if model_util.is_safetensors(file_name):
20
+ sd = load_file(file_name)
21
+ with safe_open(file_name, framework="pt") as f:
22
+ metadata = f.metadata()
23
+ else:
24
+ sd = torch.load(file_name, map_location="cpu")
25
+ metadata = None
26
+
27
+ return sd, metadata
28
+
29
+
30
+ def save_to_file(file_name, model, metadata):
31
+ if model_util.is_safetensors(file_name):
32
+ save_file(model, file_name, metadata)
33
+ else:
34
+ torch.save(model, file_name)
35
+
36
+
37
+ def split_lora_model(lora_sd, unit):
38
+ max_rank = 0
39
+
40
+ # Extract loaded lora dim and alpha
41
+ for key, value in lora_sd.items():
42
+ if "lora_down" in key:
43
+ rank = value.size()[0]
44
+ if rank > max_rank:
45
+ max_rank = rank
46
+ logger.info(f"Max rank: {max_rank}")
47
+
48
+ rank = unit
49
+ split_models = []
50
+ new_alpha = None
51
+ while rank < max_rank:
52
+ logger.info(f"Splitting rank {rank}")
53
+ new_sd = {}
54
+ for key, value in lora_sd.items():
55
+ if "lora_down" in key:
56
+ new_sd[key] = value[:rank].contiguous()
57
+ elif "lora_up" in key:
58
+ new_sd[key] = value[:, :rank].contiguous()
59
+ else:
60
+ # なぜかscaleするとおかしくなる……
61
+ # this_rank = lora_sd[key.replace("alpha", "lora_down.weight")].size()[0]
62
+ # scale = math.sqrt(this_rank / rank) # rank is > unit
63
+ # logger.info(key, value.size(), this_rank, rank, value, scale)
64
+ # new_alpha = value * scale # always same
65
+ # new_sd[key] = new_alpha
66
+ new_sd[key] = value
67
+
68
+ split_models.append((new_sd, rank, new_alpha))
69
+ rank += unit
70
+
71
+ return max_rank, split_models
72
+
73
+
74
+ def split(args):
75
+ logger.info("loading Model...")
76
+ lora_sd, metadata = load_state_dict(args.model)
77
+
78
+ logger.info("Splitting Model...")
79
+ original_rank, split_models = split_lora_model(lora_sd, args.unit)
80
+
81
+ comment = metadata.get("ss_training_comment", "") if metadata is not None else ""
82
+ for state_dict, new_rank, new_alpha in split_models:
83
+ # update metadata
84
+ if metadata is None:
85
+ new_metadata = {}
86
+ else:
87
+ new_metadata = metadata.copy()
88
+
89
+ new_metadata["ss_training_comment"] = f"split from DyLoRA, rank {original_rank} to {new_rank}; {comment}"
90
+ new_metadata["ss_network_dim"] = str(new_rank)
91
+ # new_metadata["ss_network_alpha"] = str(new_alpha.float().numpy())
92
+
93
+ model_hash, legacy_hash = train_util.precalculate_safetensors_hashes(state_dict, new_metadata)
94
+ metadata["sshs_model_hash"] = model_hash
95
+ metadata["sshs_legacy_hash"] = legacy_hash
96
+
97
+ filename, ext = os.path.splitext(args.save_to)
98
+ model_file_name = filename + f"-{new_rank:04d}{ext}"
99
+
100
+ logger.info(f"saving model to: {model_file_name}")
101
+ save_to_file(model_file_name, state_dict, new_metadata)
102
+
103
+
104
+ def setup_parser() -> argparse.ArgumentParser:
105
+ parser = argparse.ArgumentParser()
106
+
107
+ parser.add_argument("--unit", type=int, default=None, help="size of rank to split into / rankを分割するサイズ")
108
+ parser.add_argument(
109
+ "--save_to",
110
+ type=str,
111
+ default=None,
112
+ help="destination base file name: ckpt or safetensors file / 保存先のファイル名のbase、ckptまたはsafetensors",
113
+ )
114
+ parser.add_argument(
115
+ "--model",
116
+ type=str,
117
+ default=None,
118
+ help="DyLoRA model to resize at to new rank: ckpt or safetensors file / 読み込むDyLoRAモデル、ckptまたはsafetensors",
119
+ )
120
+
121
+ return parser
122
+
123
+
124
+ if __name__ == "__main__":
125
+ parser = setup_parser()
126
+
127
+ args = parser.parse_args()
128
+ split(args)
extract_lora_from_models.py ADDED
@@ -0,0 +1,360 @@
1
+ # extract approximating LoRA by svd from two SD models
2
+ # The code is based on https://github.com/cloneofsimo/lora/blob/develop/lora_diffusion/cli_svd.py
3
+ # Thanks to cloneofsimo!
4
+
5
+ import argparse
6
+ import json
7
+ import os
8
+ import time
9
+ import torch
10
+ from safetensors.torch import load_file, save_file
11
+ from tqdm import tqdm
12
+ from library import sai_model_spec, model_util, sdxl_model_util
13
+ import lora
14
+ from library.utils import setup_logging
15
+ setup_logging()
16
+ import logging
17
+ logger = logging.getLogger(__name__)
18
+
19
+ # CLAMP_QUANTILE = 0.99
20
+ # MIN_DIFF = 1e-1
21
+
22
+
23
+ def save_to_file(file_name, model, state_dict, dtype):
24
+ if dtype is not None:
25
+ for key in list(state_dict.keys()):
26
+ if type(state_dict[key]) == torch.Tensor:
27
+ state_dict[key] = state_dict[key].to(dtype)
28
+
29
+ if os.path.splitext(file_name)[1] == ".safetensors":
30
+ save_file(model, file_name)
31
+ else:
32
+ torch.save(model, file_name)
33
+
34
+
35
+ def svd(
36
+ model_org=None,
37
+ model_tuned=None,
38
+ save_to=None,
39
+ dim=4,
40
+ v2=None,
41
+ sdxl=None,
42
+ conv_dim=None,
43
+ v_parameterization=None,
44
+ device=None,
45
+ save_precision=None,
46
+ clamp_quantile=0.99,
47
+ min_diff=0.01,
48
+ no_metadata=False,
49
+ load_precision=None,
50
+ load_original_model_to=None,
51
+ load_tuned_model_to=None,
52
+ ):
53
+ def str_to_dtype(p):
54
+ if p == "float":
55
+ return torch.float
56
+ if p == "fp16":
57
+ return torch.float16
58
+ if p == "bf16":
59
+ return torch.bfloat16
60
+ return None
61
+
62
+ assert v2 != sdxl or (not v2 and not sdxl), "v2 and sdxl cannot be specified at the same time / v2とsdxlは同時に指定できません"
63
+ if v_parameterization is None:
64
+ v_parameterization = v2
65
+
66
+ load_dtype = str_to_dtype(load_precision) if load_precision else None
67
+ save_dtype = str_to_dtype(save_precision)
68
+ work_device = "cpu"
69
+
70
+ # load models
71
+ if not sdxl:
72
+ logger.info(f"loading original SD model : {model_org}")
73
+ text_encoder_o, _, unet_o = model_util.load_models_from_stable_diffusion_checkpoint(v2, model_org)
74
+ text_encoders_o = [text_encoder_o]
75
+ if load_dtype is not None:
76
+ text_encoder_o = text_encoder_o.to(load_dtype)
77
+ unet_o = unet_o.to(load_dtype)
78
+
79
+ logger.info(f"loading tuned SD model : {model_tuned}")
80
+ text_encoder_t, _, unet_t = model_util.load_models_from_stable_diffusion_checkpoint(v2, model_tuned)
81
+ text_encoders_t = [text_encoder_t]
82
+ if load_dtype is not None:
83
+ text_encoder_t = text_encoder_t.to(load_dtype)
84
+ unet_t = unet_t.to(load_dtype)
85
+
86
+ model_version = model_util.get_model_version_str_for_sd1_sd2(v2, v_parameterization)
87
+ else:
88
+ device_org = load_original_model_to if load_original_model_to else "cpu"
89
+ device_tuned = load_tuned_model_to if load_tuned_model_to else "cpu"
90
+
91
+ logger.info(f"loading original SDXL model : {model_org}")
92
+ text_encoder_o1, text_encoder_o2, _, unet_o, _, _ = sdxl_model_util.load_models_from_sdxl_checkpoint(
93
+ sdxl_model_util.MODEL_VERSION_SDXL_BASE_V1_0, model_org, device_org
94
+ )
95
+ text_encoders_o = [text_encoder_o1, text_encoder_o2]
96
+ if load_dtype is not None:
97
+ text_encoder_o1 = text_encoder_o1.to(load_dtype)
98
+ text_encoder_o2 = text_encoder_o2.to(load_dtype)
99
+ unet_o = unet_o.to(load_dtype)
100
+
101
+ logger.info(f"loading original SDXL model : {model_tuned}")
102
+ text_encoder_t1, text_encoder_t2, _, unet_t, _, _ = sdxl_model_util.load_models_from_sdxl_checkpoint(
103
+ sdxl_model_util.MODEL_VERSION_SDXL_BASE_V1_0, model_tuned, device_tuned
104
+ )
105
+ text_encoders_t = [text_encoder_t1, text_encoder_t2]
106
+ if load_dtype is not None:
107
+ text_encoder_t1 = text_encoder_t1.to(load_dtype)
108
+ text_encoder_t2 = text_encoder_t2.to(load_dtype)
109
+ unet_t = unet_t.to(load_dtype)
110
+
111
+ model_version = sdxl_model_util.MODEL_VERSION_SDXL_BASE_V1_0
112
+
113
+ # create LoRA network to extract weights: Use dim (rank) as alpha
114
+ if conv_dim is None:
115
+ kwargs = {}
116
+ else:
117
+ kwargs = {"conv_dim": conv_dim, "conv_alpha": conv_dim}
118
+
119
+ lora_network_o = lora.create_network(1.0, dim, dim, None, text_encoders_o, unet_o, **kwargs)
120
+ lora_network_t = lora.create_network(1.0, dim, dim, None, text_encoders_t, unet_t, **kwargs)
121
+ assert len(lora_network_o.text_encoder_loras) == len(
122
+ lora_network_t.text_encoder_loras
123
+ ), f"model version is different (SD1.x vs SD2.x) / それぞれのモデルのバージョンが違います(SD1.xベースとSD2.xベース) "
124
+
125
+ # get diffs
126
+ diffs = {}
127
+ text_encoder_different = False
128
+ for i, (lora_o, lora_t) in enumerate(zip(lora_network_o.text_encoder_loras, lora_network_t.text_encoder_loras)):
129
+ lora_name = lora_o.lora_name
130
+ module_o = lora_o.org_module
131
+ module_t = lora_t.org_module
132
+ diff = module_t.weight.to(work_device) - module_o.weight.to(work_device)
133
+
134
+ # clear weight to save memory
135
+ module_o.weight = None
136
+ module_t.weight = None
137
+
138
+ # Text Encoder might be same
139
+ if not text_encoder_different and torch.max(torch.abs(diff)) > min_diff:
140
+ text_encoder_different = True
141
+ logger.info(f"Text encoder is different. {torch.max(torch.abs(diff))} > {min_diff}")
142
+
143
+ diffs[lora_name] = diff
144
+
145
+ # clear target Text Encoder to save memory
146
+ for text_encoder in text_encoders_t:
147
+ del text_encoder
148
+
149
+ if not text_encoder_different:
150
+ logger.warning("Text encoder is same. Extract U-Net only.")
151
+ lora_network_o.text_encoder_loras = []
152
+ diffs = {} # clear diffs
153
+
154
+ for i, (lora_o, lora_t) in enumerate(zip(lora_network_o.unet_loras, lora_network_t.unet_loras)):
155
+ lora_name = lora_o.lora_name
156
+ module_o = lora_o.org_module
157
+ module_t = lora_t.org_module
158
+ diff = module_t.weight.to(work_device) - module_o.weight.to(work_device)
159
+
160
+ # clear weight to save memory
161
+ module_o.weight = None
162
+ module_t.weight = None
163
+
164
+ diffs[lora_name] = diff
165
+
166
+ # clear LoRA network, target U-Net to save memory
167
+ del lora_network_o
168
+ del lora_network_t
169
+ del unet_t
170
+
171
+ # make LoRA with svd
172
+ logger.info("calculating by svd")
173
+ lora_weights = {}
174
+ with torch.no_grad():
175
+ for lora_name, mat in tqdm(list(diffs.items())):
176
+ if device:
177
+ mat = mat.to(device)
178
+ mat = mat.to(torch.float) # calc by float
179
+
180
+ # if conv_dim is None, diffs do not include LoRAs for conv2d-3x3
181
+ conv2d = len(mat.size()) == 4
182
+ kernel_size = None if not conv2d else mat.size()[2:4]
183
+ conv2d_3x3 = conv2d and kernel_size != (1, 1)
184
+
185
+ rank = dim if not conv2d_3x3 or conv_dim is None else conv_dim
186
+ out_dim, in_dim = mat.size()[0:2]
187
+
188
+ if device:
189
+ mat = mat.to(device)
190
+
191
+ # logger.info(lora_name, mat.size(), mat.device, rank, in_dim, out_dim)
192
+ rank = min(rank, in_dim, out_dim) # LoRA rank cannot exceed the original dim
193
+
194
+ if conv2d:
195
+ if conv2d_3x3:
196
+ mat = mat.flatten(start_dim=1)
197
+ else:
198
+ mat = mat.squeeze()
199
+
200
+ U, S, Vh = torch.linalg.svd(mat)
201
+
202
+ U = U[:, :rank]
203
+ S = S[:rank]
204
+ U = U @ torch.diag(S)
205
+
206
+ Vh = Vh[:rank, :]
207
+
208
+ dist = torch.cat([U.flatten(), Vh.flatten()])
209
+ hi_val = torch.quantile(dist, clamp_quantile)
210
+ low_val = -hi_val
211
+
212
+ U = U.clamp(low_val, hi_val)
213
+ Vh = Vh.clamp(low_val, hi_val)
214
+
215
+ if conv2d:
216
+ U = U.reshape(out_dim, rank, 1, 1)
217
+ Vh = Vh.reshape(rank, in_dim, kernel_size[0], kernel_size[1])
218
+
219
+ U = U.to(work_device, dtype=save_dtype).contiguous()
220
+ Vh = Vh.to(work_device, dtype=save_dtype).contiguous()
221
+
222
+ lora_weights[lora_name] = (U, Vh)
223
+
224
+ # make state dict for LoRA
225
+ lora_sd = {}
226
+ for lora_name, (up_weight, down_weight) in lora_weights.items():
227
+ lora_sd[lora_name + ".lora_up.weight"] = up_weight
228
+ lora_sd[lora_name + ".lora_down.weight"] = down_weight
229
+ lora_sd[lora_name + ".alpha"] = torch.tensor(down_weight.size()[0])
230
+
231
+ # load state dict to LoRA and save it
232
+ lora_network_save, lora_sd = lora.create_network_from_weights(1.0, None, None, text_encoders_o, unet_o, weights_sd=lora_sd)
233
+ lora_network_save.apply_to(text_encoders_o, unet_o) # create internal module references for state_dict
234
+
235
+ info = lora_network_save.load_state_dict(lora_sd)
236
+ logger.info(f"Loading extracted LoRA weights: {info}")
237
+
238
+ dir_name = os.path.dirname(save_to)
239
+ if dir_name and not os.path.exists(dir_name):
240
+ os.makedirs(dir_name, exist_ok=True)
241
+
242
+ # minimum metadata
243
+ net_kwargs = {}
244
+ if conv_dim is not None:
245
+ net_kwargs["conv_dim"] = str(conv_dim)
246
+ net_kwargs["conv_alpha"] = str(float(conv_dim))
247
+
248
+ metadata = {
249
+ "ss_v2": str(v2),
250
+ "ss_base_model_version": model_version,
251
+ "ss_network_module": "networks.lora",
252
+ "ss_network_dim": str(dim),
253
+ "ss_network_alpha": str(float(dim)),
254
+ "ss_network_args": json.dumps(net_kwargs),
255
+ }
256
+
257
+ if not no_metadata:
258
+ title = os.path.splitext(os.path.basename(save_to))[0]
259
+ sai_metadata = sai_model_spec.build_metadata(None, v2, v_parameterization, sdxl, True, False, time.time(), title=title)
260
+ metadata.update(sai_metadata)
261
+
262
+ lora_network_save.save_weights(save_to, save_dtype, metadata)
263
+ logger.info(f"LoRA weights are saved to: {save_to}")
264
+
265
+
266
+ def setup_parser() -> argparse.ArgumentParser:
267
+ parser = argparse.ArgumentParser()
268
+ parser.add_argument("--v2", action="store_true", help="load Stable Diffusion v2.x model / Stable Diffusion 2.xのモデルを読み込む")
269
+ parser.add_argument(
270
+ "--v_parameterization",
271
+ action="store_true",
272
+ default=None,
273
+ help="make LoRA metadata for v-parameterization (default is same to v2) / 作成するLoRAのメタデータにv-parameterization用と設定する(省略時はv2と同じ)",
274
+ )
275
+ parser.add_argument(
276
+ "--sdxl", action="store_true", help="load Stable Diffusion SDXL base model / Stable Diffusion SDXL baseのモデルを読み込む"
277
+ )
278
+ parser.add_argument(
279
+ "--load_precision",
280
+ type=str,
281
+ default=None,
282
+ choices=[None, "float", "fp16", "bf16"],
283
+ help="precision in loading, model default if omitted / 読み込み時に精度を変更して読み込む、省略時はモデルファイルによる"
284
+ )
285
+ parser.add_argument(
286
+ "--save_precision",
287
+ type=str,
288
+ default=None,
289
+ choices=[None, "float", "fp16", "bf16"],
290
+ help="precision in saving, same to merging if omitted / 保存時に精度を変更して保存する、省略時はfloat",
291
+ )
292
+ parser.add_argument(
293
+ "--model_org",
294
+ type=str,
295
+ default=None,
296
+ required=True,
297
+ help="Stable Diffusion original model: ckpt or safetensors file / 元モデル、ckptまたはsafetensors",
298
+ )
299
+ parser.add_argument(
300
+ "--model_tuned",
301
+ type=str,
302
+ default=None,
303
+ required=True,
304
+ help="Stable Diffusion tuned model, LoRA is difference of `original to tuned`: ckpt or safetensors file / 派生モデル(生成されるLoRAは元→派生の差分になります)、ckptまたはsafetensors",
305
+ )
306
+ parser.add_argument(
307
+ "--save_to",
308
+ type=str,
309
+ default=None,
310
+ required=True,
311
+ help="destination file name: ckpt or safetensors file / 保存先のファイル名、ckptまたはsafetensors",
312
+ )
313
+ parser.add_argument("--dim", type=int, default=4, help="dimension (rank) of LoRA (default 4) / LoRAの次元数(rank)(デフォルト4)")
314
+ parser.add_argument(
315
+ "--conv_dim",
316
+ type=int,
317
+ default=None,
318
+ help="dimension (rank) of LoRA for Conv2d-3x3 (default None, disabled) / LoRAのConv2d-3x3の次元数(rank)(デフォルトNone、適用なし)",
319
+ )
320
+ parser.add_argument("--device", type=str, default=None, help="device to use, cuda for GPU / 計算を行うデバイス、cuda でGPUを使う")
321
+ parser.add_argument(
322
+ "--clamp_quantile",
323
+ type=float,
324
+ default=0.99,
325
+ help="Quantile clamping value, float, (0-1). Default = 0.99 / 値をクランプするための分位点、float、(0-1)。デフォルトは0.99",
326
+ )
327
+ parser.add_argument(
328
+ "--min_diff",
329
+ type=float,
330
+ default=0.01,
331
+ help="Minimum difference between finetuned model and base to consider them different enough to extract, float, (0-1). Default = 0.01 /"
332
+ + "LoRAを抽出するために元モデルと派生モデルの差分の最小値、float、(0-1)。デフォルトは0.01",
333
+ )
334
+ parser.add_argument(
335
+ "--no_metadata",
336
+ action="store_true",
337
+ help="do not save sai modelspec metadata (minimum ss_metadata for LoRA is saved) / "
338
+ + "sai modelspecのメタデータを保存しない(LoRAの最低限のss_metadataは保存される)",
339
+ )
340
+ parser.add_argument(
341
+ "--load_original_model_to",
342
+ type=str,
343
+ default=None,
344
+ help="location to load original model, cpu or cuda, cuda:0, etc, default is cpu, only for SDXL / 元モデル読み込み先、cpuまたはcuda、cuda:0など、省略時はcpu、SDXLのみ有効",
345
+ )
346
+ parser.add_argument(
347
+ "--load_tuned_model_to",
348
+ type=str,
349
+ default=None,
350
+ help="location to load tuned model, cpu or cuda, cuda:0, etc, default is cpu, only for SDXL / 派生モデル読み込み先、cpuまたはcuda、cuda:0など、省略時はcpu、SDXLのみ有効",
351
+ )
352
+
353
+ return parser
354
+
355
+
356
+ if __name__ == "__main__":
357
+ parser = setup_parser()
358
+
359
+ args = parser.parse_args()
360
+ svd(**vars(args))
fine_tune_README_ja.md ADDED
@@ -0,0 +1,140 @@
1
+ NovelAIの提案した学習手法、自動キャプショニング、タグ付け、Windows+VRAM 12GB(SD v1.xの場合)環境等に対応したfine tuningです。ここでfine tuningとは、モデルを画像とキャプションで学習することを指します(LoRAやTextual Inversion、Hypernetworksは含みません)。
2
+
3
+ [学習についての共通ドキュメント](./train_README-ja.md) もあわせてご覧ください。
4
+
5
+ # 概要
6
+
7
+ Diffusersを用いてStable DiffusionのU-Netのfine tuningを行います。NovelAIの記事にある以下の改善に対応しています(Aspect Ratio BucketingについてはNovelAIのコードを参考にしましたが、最終的なコードはすべてオリジナルです)。
8
+
9
+ * CLIP(Text Encoder)の最後の層ではなく最後から二番目の層の出力を用いる。
10
+ * 正方形以外の解像度での学習(Aspect Ratio Bucketing) 。
11
+ * トークン長を75から225に拡張する。
12
+ * BLIPによるキャプショニング(キャプションの自動作成)、DeepDanbooruまたはWD14Taggerによる自動タグ付けを行う。
13
+ * Hypernetworkの学習にも対応する。
14
+ * Stable Diffusion v2.0(baseおよび768/v)に対応。
15
+ * VAEの出力をあらかじめ取得しディスクに保存しておくことで、学習の省メモリ化、高速化を図る。
16
+
17
+ デフォルトではText Encoderの学習は行いません。モデル全体のfine tuningではU-Netだけを学習するのが一般的なようです(NovelAIもそのようです)。オプション指定でText Encoderも学習対象とできます。
18
+
19
+ # 追加機能について
20
+
21
+ ## CLIPの出力の変更
22
+
23
+ プロンプトを画像に反映するため、テキストの特徴量への変換を行うのがCLIP(Text Encoder)です。Stable DiffusionではCLIPの最後の層の出力を用いていますが、それを最後から二番目の層の出力を用いるよう変更できます。NovelAIによると、これによりより正確にプロンプトが反映されるようになるとのことです。
24
+ 元のまま、最後の層の出力を用いることも可能です。
25
+
26
+ ※Stable Diffusion 2.0では最後から二番目の層をデフォルトで使います。clip_skipオプションを指定しないでください。
27
+
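As a rough illustration only of what "using the second-to-last layer" means, here is a minimal sketch with Hugging Face `transformers` (the model name and the final-layer-norm handling are assumptions for this example, not the training scripts' actual implementation):

```python
from transformers import CLIPTokenizer, CLIPTextModel

tokenizer = CLIPTokenizer.from_pretrained("openai/clip-vit-large-patch14")
text_encoder = CLIPTextModel.from_pretrained("openai/clip-vit-large-patch14")

tokens = tokenizer("a photo of a cat", return_tensors="pt")
out = text_encoder(**tokens, output_hidden_states=True)

last_layer = out.last_hidden_state   # default SD v1 behavior (clip_skip=1)
penultimate = out.hidden_states[-2]  # second-to-last layer (clip_skip=2)
# Note: real implementations typically also apply the final layer norm to `penultimate`.
```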
28
+ ## 正方形以外の解像度での学習
29
+
30
+ Stable Diffusionは512\*512で学習されていますが、それに加えて256\*1024や384\*640といった解像度でも学習します。これによりトリミングされる部分が減り、より正しくプロンプトと画像の関係が学習されることが期待されます。
31
+ 学習解像度はパラメータとして与えられた解像度の面積(=メモリ使用量)を超えない範囲で、64ピクセル単位で縦横に調整、作成されます。
32
+
33
+ 機械学習では入力サイズをすべて統一するのが一般的ですが、特に制約があるわけではなく、実際は同一のバッチ内で統一されていれば大丈夫です。NovelAIの言うbucketingは、あらかじめ教師データを、アスペクト比に応じた学習解像度ごとに分類しておくことを指しているようです。そしてバッチを各bucket内の画像で作成することで、バッチの画像サイズを統一します。
34
+
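To make the bucketing idea concrete, a minimal sketch (illustration only; the function name and size limits are made up, this is not the actual implementation) of enumerating bucket resolutions in 64-pixel steps without exceeding the reference area:

```python
def make_buckets(max_area=512 * 512, step=64, min_size=256, max_size=1024):
    """Enumerate (width, height) pairs in `step`-pixel units whose area stays within max_area."""
    buckets = set()
    width = min_size
    while width <= max_size:
        # largest multiple of `step` such that width * height <= max_area
        height = min(max_size, (max_area // width) // step * step)
        if height >= min_size:
            buckets.add((width, height))
            buckets.add((height, width))  # the rotated bucket is also valid
        width += step
    return sorted(buckets)

print(make_buckets())  # includes (256, 1024), (384, 640), (512, 512), ...
```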
35
+ ## トークン長の75から225への拡張
36
+
37
+ Stable Diffusionでは最大75トークン(開始・終了を含むと77トークン)ですが、それを225トークンまで拡張します。
38
+ ただしCLIPが受け付ける最大長は75トークンですので、225トークンの場合、単純に三分割してCLIPを呼び出してから結果を連結しています。
39
+
40
+ ※これが望ましい実装なのかどうかはいまひとつわかりません。とりあえず動いてはいるようです。特に2.0では何も参考になる実装がないので独自に実装してあります。
41
+
42
+ ※Automatic1111氏のWeb UIではカンマを意識して分割、といったこともしているようですが、私の場合はそこまでしておらず単純な分割です。
43
+
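A minimal sketch of the "split into chunks and concatenate" idea described above (illustration only; the actual scripts also handle BOS/EOS insertion, padding and prompt weighting):

```python
import torch

def encode_long_prompt(text_encoder, input_ids, chunk_length=77):
    # input_ids: (batch, 3 * 77) -- three 77-token chunks, each assumed to be
    # already wrapped with its own BOS/EOS tokens by the caller
    chunks = input_ids.split(chunk_length, dim=1)           # 3 tensors of (batch, 77)
    hidden = [text_encoder(chunk)[0] for chunk in chunks]   # each (batch, 77, dim)
    return torch.cat(hidden, dim=1)                         # (batch, 231, dim)
```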
44
+ # 学習の手順
45
+
46
+ あらかじめこのリポジトリのREADMEを参照し、環境整備を行ってください。
47
+
48
+ ## データの準備
49
+
50
+ [学習データの準備について](./train_README-ja.md) を参照してください。fine tuningではメタデータを用いるfine tuning方式のみ対応しています。
51
+
52
+ ## 学習の実行
53
+ たとえば以下のように実行します。以下は省メモリ化のための設定です。それぞれの行を必要に応じて書き換えてください。
54
+
55
+ ```
56
+ accelerate launch --num_cpu_threads_per_process 1 fine_tune.py
57
+ --pretrained_model_name_or_path=<.ckptまたは.safetensordまたはDiffusers版モデルのディレクトリ>
58
+ --output_dir=<学習したモデルの出力先フォルダ>
59
+ --output_name=<学習したモデル出力時のファイル名>
60
+ --dataset_config=<データ準備で作成した.tomlファイル>
61
+ --save_model_as=safetensors
62
+ --learning_rate=5e-6 --max_train_steps=10000
63
+ --use_8bit_adam --xformers --gradient_checkpointing
64
+ --mixed_precision=fp16
65
+ ```
66
+
67
+ `num_cpu_threads_per_process` には通常は1を指定するとよいようです。
68
+
69
+ `pretrained_model_name_or_path` に追加学習を行う元となるモデルを指定します。Stable Diffusionのcheckpointファイル(.ckptまたは.safetensors)、Diffusersのローカルディスクにあるモデルディレクトリ、DiffusersのモデルID("stabilityai/stable-diffusion-2"など)が指定できます。
70
+
71
+ `output_dir` に学習後のモデルを保存するフォルダを指定します。`output_name` にモデルのファイル名を拡張子を除いて指定します。`save_model_as` でsafetensors形式での保存を指定しています。
72
+
73
+ `dataset_config` に `.toml` ファイルを指定します。ファイル内でのバッチサイズ指定は、当初はメモリ消費を抑えるために `1` としてください。
74
+
75
+ 学習させるステップ数 `max_train_steps` を10000とします。学習率 `learning_rate` はここでは5e-6を指定しています。
76
+
77
+ 省メモリ化のため `mixed_precision="fp16"` を指定します(RTX30 シリーズ以降では `bf16` も指定できます。環境整備時にaccelerateに行った設定と合わせてください)。また `gradient_checkpointing` を指定します。
78
+
79
+ オプティマイザ(モデルを学習データにあうように最適化=学習させるクラス)にメモリ消費の少ない 8bit AdamW を使うため、 `optimizer_type="AdamW8bit"` を指定します。
80
+
81
+ `xformers` オプションを指定し、xformersのCrossAttentionを用います。xformersをインストールしていない場合やエラーとなる場合(環境にもよりますが `mixed_precision="no"` の場合など)、代わりに `mem_eff_attn` オプションを指定すると省メモリ版CrossAttentionを使用します(速度は遅くなります)。
82
+
83
+ ある程度メモリがある場合は、`.toml` ファイルを編集してバッチサイズをたとえば `4` くらいに増やしてください(高速化と精度向上の可能性があります)。
84
+
85
+ ### よく使われるオプションについて
86
+
87
+ 以下の場合にはオプションに関するドキュメントを参照してください。
88
+
89
+ - Stable Diffusion 2.xまたはそこからの派生モデルを学習する
90
+ - clip skipを2以上を前提としたモデルを学習する
91
+ - 75トークンを超えたキャプションで学習する
92
+
93
+ ### バッチサイズについて
94
+
95
+ モデル全体を学習するためLoRA等の学習に比べるとメモリ消費量は多くなります(DreamBoothと同じ)。
96
+
97
+ ### 学習率について
98
+
99
+ 1e-6から5e-6程度が一般的なようです。他のfine tuningの例なども参照してみてください。
100
+
101
+ ### 以前の形式のデータセット指定をした場合のコマンドライン
102
+
103
+ 解像度やバッチサイズをオプションで指定します。コマンドラインの例は以下の通りです。
104
+
105
+ ```
106
+ accelerate launch --num_cpu_threads_per_process 1 fine_tune.py
107
+ --pretrained_model_name_or_path=model.ckpt
108
+ --in_json meta_lat.json
109
+ --train_data_dir=train_data
110
+ --output_dir=fine_tuned
111
+ --shuffle_caption
112
+ --train_batch_size=1 --learning_rate=5e-6 --max_train_steps=10000
113
+ --use_8bit_adam --xformers --gradient_checkpointing
114
+ --mixed_precision=bf16
115
+ --save_every_n_epochs=4
116
+ ```
117
+
118
+ <!--
119
+ ### 勾配をfp16とした学習(実験的機能)
120
+ full_fp16オプションを指定すると勾配を通常のfloat32からfloat16(fp16)に変更して学習します(mixed precisionではなく完全なfp16学習になるようです)。これによりSD1.xの512*512サイズでは8GB未満、SD2.xの512*512サイズで12GB未満のVRAM使用量で学習できるようです。
121
+
122
+ あらかじめaccelerate configでfp16を指定し、オプションでmixed_precision="fp16"としてください(bf16では動作しません)。
123
+
124
+ メモリ使用量を最小化するためには、xformers、use_8bit_adam、gradient_checkpointingの各オプションを指定し、train_batch_sizeを1としてください。
125
+ (余裕があるようならtrain_batch_sizeを段階的に増やすと若干精度が上がるはずです。)
126
+
127
+ PyTorchのソースにパッチを当てて無理やり実現しています(PyTorch 1.12.1と1.13.0で確認)。精度はかなり落ちますし、途中で学習失敗する確率も高くなります。学習率やステップ数の設定もシビアなようです。それらを認識したうえで自己責任でお使いください。
128
+ -->
129
+
130
+ # fine tuning特有のその他の主なオプション
131
+
132
+ すべてのオプションについては別文書を参照してください。
133
+
134
+ ## `train_text_encoder`
135
+ Text Encoderも学習対象とします。メモリ使用量が若干増加します。
136
+
137
+ 通常のfine tuningではText Encoderは学習対象としませんが(恐らくText Encoderの出力に従うようにU-Netを学習するため)、学習データ数が少ない場合には、DreamBoothのようにText Encoder側に学習させるのも有効的なようです。
138
+
139
+ ## `diffusers_xformers`
140
+ スクリプト独自のxformers置換機能ではなくDiffusersのxformers機能を利用します。Hypernetworkの学習はできなくなります。
gen_img_README-ja.md ADDED
@@ -0,0 +1,487 @@
1
+ SD 1.xおよび2.xのモデル、当リポジトリで学習したLoRA、ControlNet(v1.0のみ動作確認)などに対応した、Diffusersベースの推論(画像生成)スクリプトです。コマンドラインから用います。
2
+
3
+ # 概要
4
+
5
+ * Diffusers (v0.10.2) ベースの推論(画像生成)スクリプト。
6
+ * SD 1.xおよび2.x (base/v-parameterization)モデルに対応。
7
+ * txt2img、img2img、inpaintingに対応。
8
+ * 対話モード、およびファイルからのプロンプト読み込み、連続生成に対応。
9
+ * プロンプト1行あたりの生成枚数を指定可能。
10
+ * 全体の繰り返し回数を指定可能。
11
+ * `fp16`だけでなく`bf16`にも対応。
12
+ * xformersに対応し高速生成が可能。
13
+ * xformersにより省メモリ生成を行いますが、Automatic 1111氏のWeb UIほど最適化していないため、512*512の画像生成でおおむね6GB程度のVRAMを使用します。
14
+ * プロンプトの225トークンへの拡張。ネガティブプロンプト、重みづけに対応。
15
+ * Diffusersの各種samplerに対応(Web UIよりもsampler数は少ないです)。
16
+ * Text Encoderのclip skip(最後からn番目の層の出力を用いる)に対応。
17
+ * VAEの別途読み込み。
18
+ * CLIP Guided Stable Diffusion、VGG16 Guided Stable Diffusion、Highres. fix、upscale対応。
19
+ * Highres. fixはWeb UIの実装を全く確認していない独自実装のため、出力結果は異なるかもしれません。
20
+ * LoRA対応。適用率指定、複数LoRA同時利用、重みのマージに対応。
21
+ * Text EncoderとU-Netで別の適用率を指定することはできません。
22
+ * Attention Coupleに対応。
23
+ * ControlNet v1.0に対応。
24
+ * 途中でモデルを切り替えることはできませんが、バッチファイルを組むことで対応できます。
25
+ * 個人的に欲しくなった機能をいろいろ追加。
26
+
27
+ 機能追加時にすべてのテストを行っているわけではないため、以前の機能に影響が出て一部機能が動かない可能性があります。何か問題があればお知らせください。
28
+
29
+ # 基本的な使い方
30
+
31
+ ## 対話モードでの画像生成
32
+
33
+ 以下のように入力してください。
34
+
35
+ ```batchfile
36
+ python gen_img_diffusers.py --ckpt <モデル名> --outdir <画像出力先> --xformers --fp16 --interactive
37
+ ```
38
+
39
+ `--ckpt`オプションにモデル(Stable Diffusionのcheckpointファイル、またはDiffusersのモデルフォルダ)、`--outdir`オプションに画像の出力先フォルダを指定します。
40
+
41
+ `--xformers`オプションでxformersの使用を指定します(xformersを使わない場合は外してください)。`--fp16`オプションでfp16(半精度)での推論を行います。RTX 30系のGPUでは `--bf16`オプションでbf16(bfloat16)での推論を行うこともできます。
42
+
43
+ `--interactive`オプションで対話モードを指定しています。
44
+
45
+ Stable Diffusion 2.0(またはそこからの追加学習モデル)を使う場合は`--v2`オプションを追加してください。v-parameterizationを使うモデル(`768-v-ema.ckpt`およびそこからの追加学習モデル)を使う場合はさらに`--v_parameterization`を追加してください。
46
+
47
+ `--v2`の指定有無が間違っているとモデル読み込み時にエラーになります。`--v_parameterization`の指定有無が間違っていると茶色い画像が表示されます。
48
+
49
+ `Type prompt:`と表示されたらプロンプトを入力してください。
50
+
51
+ ![image](https://user-images.githubusercontent.com/52813779/235343115-f3b8ac82-456d-4aab-9724-0cc73c4534aa.png)
52
+
53
+ ※画像が表示されずエラーになる場合、headless(画面表示機能なし)のOpenCVがインストールされているかもしれません。`pip install opencv-python`として通常のOpenCVを入れてください。または`--no_preview`オプションで画像表示を止めてください。
54
+
55
+ 画像ウィンドウを選択してから何らかのキーを押すとウィンドウが閉じ、次のプロンプトが入力できます。プロンプトでCtrl+Z、エンターの順に打鍵するとスクリプトを閉じます。
56
+
57
+ ## 単一のプロンプトで画像を一括生成
58
+
59
+ 以下のように入力します(実際には1行で入力します)。
60
+
61
+ ```batchfile
62
+ python gen_img_diffusers.py --ckpt <モデル名> --outdir <画像出力先>
63
+ --xformers --fp16 --images_per_prompt <生成枚数> --prompt "<プロンプト>"
64
+ ```
65
+
66
+ `--images_per_prompt`オプションで、プロンプト1件当たりの生成枚数を指定します。`--prompt`オプションでプロンプトを指定します。スペースを含む場合はダブルクォーテーションで囲んでください。
67
+
68
+ `--batch_size`オプションでバッチサイズを指定できます(後述)。
69
+
70
+ ## ファイルからプロンプトを読み込み一括生成
71
+
72
+ 以下のように入力します。
73
+
74
+ ```batchfile
75
+ python gen_img_diffusers.py --ckpt <モデル名> --outdir <画像出力先>
76
+ --xformers --fp16 --from_file <プロンプトファイル名>
77
+ ```
78
+
79
+ `--from_file`オプションで、プロンプトが記述されたファイルを指定します。1行1プロンプトで記述してください。`--images_per_prompt`オプションを指定して1行あたり生成枚数を指定できます。
80
+
81
+ ## ネガティブプロンプト、重みづけの使用
82
+
83
+ プロンプトオプション(プロンプト内で`--x`のように指定、後述)で`--n`を書くと、以降がネガティブプロンプトとなります。
84
+
85
+ またAUTOMATIC1111氏のWeb UIと同様の `()` や` []` 、`(xxx:1.3)` などによる重みづけが可能です(実装はDiffusersの[Long Prompt Weighting Stable Diffusion](https://github.com/huggingface/diffusers/blob/main/examples/community/README.md#long-prompt-weighting-stable-diffusion)からコピーしたものです)。
86
+
87
+ コマンドラインからのプロンプト指定、ファイルからのプロンプト読み込みでも同様に指定できます。
88
+
89
+ ![image](https://user-images.githubusercontent.com/52813779/235343128-e79cd768-ec59-46f5-8395-fce9bdc46208.png)
90
+
91
+ # 主なオプション
92
+
93
+ コマンドラインから指定してください。
94
+
95
+ ## モデルの指定
96
+
97
+ - `--ckpt <モデル名>`:モデル名を指定します。`--ckpt`オプションは必須です。Stable Diffusionのcheckpointファイル、またはDiffusersのモデルフォルダ、Hugging FaceのモデルIDを指定できます。
98
+
99
+ - `--v2`:Stable Diffusion 2.x系のモデルを使う場合に指定します。1.x系の場合には指定不要です。
100
+
101
+ - `--v_parameterization`:v-parameterizationを使うモデルを使う場合に指定します(`768-v-ema.ckpt`およびそこからの追加学習モデル、Waifu Diffusion v1.5など)。
102
+
103
+ `--v2`の指定有無が間違っているとモデル読み込み時にエラーになります。`--v_parameterization`の指定有無が間違っていると茶色い画像が表示されます。
104
+
105
+ - `--vae`:使用するVAEを指定します。未指定時はモデル内のVAEを使用します。
106
+
107
+ ## 画像生成と出力
108
+
109
+ - `--interactive`:インタラクティブモードで動作します。プロンプトを入力すると画像が生成されます。
110
+
111
+ - `--prompt <プロンプト>`:プロンプトを指定します。スペースを含む場合はダブルクォーテーションで囲んでください。
112
+
113
+ - `--from_file <プロンプトファイル名>`:プロンプトが記述されたファイルを指定します。1行1プロンプトで記述してください。なお画像サイズやguidance scaleはプロンプトオプション(後述)で指定できます。
114
+
115
+ - `--W <画像幅>`:画像の幅を指定します。デフォルトは`512`です。
116
+
117
+ - `--H <画像高さ>`:画像の高さを指定します。デフォルトは`512`です。
118
+
119
+ - `--steps <ステップ数>`:サンプリングステップ数を指定します。デフォルトは`50`です。
120
+
121
+ - `--scale <ガイダンススケール>`:unconditionalガイダンススケールを指定します。デフォルトは`7.5`です。
122
+
123
+ - `--sampler <サンプラー名>`:サンプラーを指定します。デフォルトは`ddim`です。Diffusersで提供されているddim、pndm、dpmsolver、dpmsolver++、lms、euler、euler_aが指定可能です(後ろの三つはk_lms、k_euler、k_euler_aでも指定できます)。
124
+
125
+ - `--outdir <画像出力先フォルダ>`:画像の出力先を指定します。
126
+
127
+ - `--images_per_prompt <生成枚数>`:プロンプト1件当たりの生成枚数を指定します。デフォルトは`1`です。
128
+
129
+ - `--clip_skip <スキップ数>`:CLIPの後ろから何番目の層を使うかを指定します。省略時は最後の層を使います。
130
+
131
+ - `--max_embeddings_multiples <倍数>`:CLIPの入出力長をデフォルト(75)の何倍にするかを指定します。未指定時は75のままです。たとえば3を指定すると入出力長が225になります。
132
+
133
+ - `--negative_scale` : unconditioningのguidance scaleを個別に指定します。[gcem156氏のこちらの記事](https://note.com/gcem156/n/ne9a53e4a6f43)を参考に実装したものです。
134
+
135
+ ## メモリ使用量や生成速度の調整
136
+
137
+ - `--batch_size <バッチサイズ>`:バッチサイズを指定します。デフォルトは`1`です。バッチサイズが大きいとメモリを多く消費しますが、生成速度が速くなります。
138
+
139
+ - `--vae_batch_size <VAEのバッチサイズ>`:VAEのバッチサイズを指定します。デフォルトはバッチサイズと同じです。
140
+ VAEのほうがメモリを多く消費するため、デノイジング後(stepが100%になった後)でメモリ不足になる場合があります。このような場合にはVAEのバッチサイズを小さくしてください。
141
+
142
+ - `--xformers`:xformersを使う場合に指定します。
143
+
144
+ - `--fp16`:fp16(半精度)での推論を行います。`fp16`と`bf16`をどちらも指定しない場合はfp32(単精度)での推論を行います。
145
+
146
+ - `--bf16`:bf16(bfloat16)での推論を行います。RTX 30系のGPUでのみ指定可能です。`--bf16`オプションはRTX 30系以外のGPUではエラーになります。`fp16`よりも`bf16`のほうが推論結果がNaNになる(真っ黒の画像になる)可能性が低いようです。
147
+
148
+ ## 追加ネットワーク(LoRA等)の使用
149
+
150
+ - `--network_module`:使用する追加ネットワークを指定します。LoRAの場合は`--network_module networks.lora`と指定します。複数のLoRAを使用する場合は`--network_module networks.lora networks.lora networks.lora`のように指定します。
151
+
152
+ - `--network_weights`:使用する追加ネットワークの重みファイルを指定します。`--network_weights model.safetensors`のように指定します。複数のLoRAを使用する場合は`--network_weights model1.safetensors model2.safetensors model3.safetensors`のように指定します。引数の数は`--network_module`で指定した数と同じにしてください。
153
+
154
+ - `--network_mul`:使用する追加ネットワークの重みを何倍にするかを指定します。デフォルトは`1`です。`--network_mul 0.8`のように指定します。複数のLoRAを使用する場合は`--network_mul 0.4 0.5 0.7`のように指定します。引数の数は`--network_module`で指定した数と同じにしてください。
155
+
156
+ - `--network_merge`:使用する追加ネットワークの重みを`--network_mul`に指定した重みであらかじめマージします。`--network_pre_calc` と同時に使用できません。プロンプトオプションの`--am`、およびRegional LoRAは使用できなくなりますが、LoRA未使用時と同じ程度まで生成が高速化されます。
157
+
158
+ - `--network_pre_calc`:使用する追加ネットワークの重みを生成ごとにあらかじめ計算します。プロンプトオプションの`--am`が使用できます。LoRA未使用時と同じ程度まで生成は高速化されますが、生成前に重みを計算する時間が必要で、またメモリ使用量も若干増加します。Regional LoRA使用時は無効になります 。
159
+
160
+ # 主なオプションの指定例
161
+
162
+ 次は同一プロンプトで64枚をバッチサイズ4で一括生成する例です。
163
+
164
+ ```batchfile
165
+ python gen_img_diffusers.py --ckpt model.ckpt --outdir outputs
166
+ --xformers --fp16 --W 512 --H 704 --scale 12.5 --sampler k_euler_a
167
+ --steps 32 --batch_size 4 --images_per_prompt 64
168
+ --prompt "beautiful flowers --n monochrome"
169
+ ```
170
+
171
+ 次はファイルに書かれたプロンプトを、それぞれ10枚ずつ、バッチサイズ4で一括生成する例です。
172
+
173
+ ```batchfile
174
+ python gen_img_diffusers.py --ckpt model.ckpt --outdir outputs
175
+ --xformers --fp16 --W 512 --H 704 --scale 12.5 --sampler k_euler_a
176
+ --steps 32 --batch_size 4 --images_per_prompt 10
177
+ --from_file prompts.txt
178
+ ```
179
+
180
+ Textual Inversion(後述)およびLoRAの使用例です。
181
+
182
+ ```batchfile
183
+ python gen_img_diffusers.py --ckpt model.safetensors
184
+ --scale 8 --steps 48 --outdir txt2img --xformers
185
+ --W 512 --H 768 --fp16 --sampler k_euler_a
186
+ --textual_inversion_embeddings goodembed.safetensors negprompt.pt
187
+ --network_module networks.lora networks.lora
188
+ --network_weights model1.safetensors model2.safetensors
189
+ --network_mul 0.4 0.8
190
+ --clip_skip 2 --max_embeddings_multiples 1
191
+ --batch_size 8 --images_per_prompt 1 --interactive
192
+ ```
193
+
194
+ # プロンプトオプション
195
+
196
+ プロンプト内で、`--n`のように「ハイフンふたつ+アルファベットn文字」でプロンプトから各種オプションの指定が可能です。対話モード、コマンドライン、ファイル、いずれからプロンプトを指定する場合でも有効です。
197
+
198
+ プロンプトのオプション指定`--n`の前後にはスペースを入れてください。
199
+
200
+ - `--n`:ネガティブプロンプトを指定します。
201
+
202
+ - `--w`:画像幅を指定します。コマンドラインからの指定を上書きします。
203
+
204
+ - `--h`:画像高さを指定します。コマンドラインからの指定を上書きします。
205
+
206
+ - `--s`:ステップ数を指定します。コマンドラインからの指定を上書きします。
207
+
208
+ - `--d`:この画像の乱数seedを指定します。`--images_per_prompt`を指定している場合は「--d 1,2,3,4」のようにカンマ区切りで複数指定してください。
209
+ ※様々な理由により、Web UIとは同じ乱数seedでも生成される画像が異なる場合があります。
210
+
211
+ - `--l`:guidance scaleを指定します。コマンドラインからの指定を上書きします。
212
+
213
+ - `--t`:img2img(後述)のstrengthを指定します。コ��ンドラインからの指定を上書きします。
214
+
215
+ - `--nl`:ネガティブプロンプトのguidance scaleを指定します(後述)。コマンドラインからの指定を上書きします。
216
+
217
+ - `--am`:追加ネットワークの重みを指定します。コマンドラインからの指定を上書きします。複数の追加ネットワークを使用する場合は`--am 0.8,0.5,0.3`のように __カンマ区切りで__ 指定します。
218
+
219
+ ※これらのオプションを指定すると、バッチサイズよりも小さいサイズでバッチが実行される場合があります(これらの値が異なると一括生成できないため)。(あまり気にしなくて大丈夫ですが、ファイルからプロンプトを読み込み生成する場合は、これらの値が同一のプロンプトを並べておくと効率が良くなります。)
220
+
221
+ 例:
222
+ ```
223
+ (masterpiece, best quality), 1girl, in shirt and plated skirt, standing at street under cherry blossoms, upper body, [from below], kind smile, looking at another, [goodembed] --n realistic, real life, (negprompt), (lowres:1.1), (worst quality:1.2), (low quality:1.1), bad anatomy, bad hands, text, error, missing fingers, extra digit, fewer digits, cropped, normal quality, jpeg artifacts, signature, watermark, username, blurry --w 960 --h 640 --s 28 --d 1
224
+ ```
225
+
226
+ ![image](https://user-images.githubusercontent.com/52813779/235343446-25654172-fff4-4aaf-977a-20d262b51676.png)
227
+
228
+ # img2img
229
+
230
+ ## オプション
231
+
232
+ - `--image_path`:img2imgに利用する画像を指定します。`--image_path template.png`のように指定します。フォルダを指定すると、そのフォルダの画像を順次利用します。
233
+
234
+ - `--strength`:img2imgのstrengthを指定します。`--strength 0.8`のように指定します。デフォルトは`0.8`です。
235
+
236
+ - `--sequential_file_name`:ファイル名を連番にするかどうかを指定します。指定すると生成されるファイル名が`im_000001.png`からの連番になります。
237
+
238
+ - `--use_original_file_name`:指定すると生成ファイル名がオリジナルのファイル名と同じになります。
239
+
240
+ ## コマンドラインからの実行例
241
+
242
+ ```batchfile
243
+ python gen_img_diffusers.py --ckpt trinart_characters_it4_v1_vae_merged.ckpt
244
+ --outdir outputs --xformers --fp16 --scale 12.5 --sampler k_euler --steps 32
245
+ --image_path template.png --strength 0.8
246
+ --prompt "1girl, cowboy shot, brown hair, pony tail, brown eyes,
247
+ sailor school uniform, outdoors
248
+ --n lowres, bad anatomy, bad hands, error, missing fingers, cropped,
249
+ worst quality, low quality, normal quality, jpeg artifacts, (blurry),
250
+ hair ornament, glasses"
251
+ --batch_size 8 --images_per_prompt 32
252
+ ```
253
+
254
+ `--image_path`オプションにフォルダを指定すると、そのフォルダの画像を順次読み込みます。生成される枚数は画像枚数ではなく、プロンプト数になりますので、`--images_per_prompt`オプションを指定してimg2imgする画像の枚数とプロンプト数を合わせてください。
255
+
256
+ ファイルはファイル名でソートして読み込みます。なおソート順は文字列順となりますので(`1.jpg→2.jpg→10.jpg`ではなく`1.jpg→10.jpg→2.jpg`の順)、頭を0埋めするなどしてご対応ください(`01.jpg→02.jpg→10.jpg`)。
257
+
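For reference, a small hypothetical Python snippet to zero-pad numeric file names so that string order matches numeric order (the folder name is just an example; adjust the padding width to your files):

```python
import os

folder = "src_image"  # hypothetical folder name
for name in os.listdir(folder):
    stem, ext = os.path.splitext(name)
    if stem.isdigit():
        # 1.jpg -> 001.jpg, 10.jpg -> 010.jpg
        os.rename(os.path.join(folder, name),
                  os.path.join(folder, f"{int(stem):03d}{ext}"))
```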
258
+ ## img2imgを利用したupscale
259
+
260
+ img2img時にコマンドラインオプションの`--W`と`--H`で生成画像サイズを指定すると、元画像をそのサイズにリサイズしてからimg2imgを行います。
261
+
262
+ またimg2imgの元画像がこのスクリプトで生成した画像の場合、プロンプトを省略すると、元画像のメタデータからプロンプトを取得しそのまま用います。これによりHighres. fixの2nd stageの動作だけを行うことができます。
263
+
264
+ ## img2img時のinpainting
265
+
266
+ 画像およびマスク画像を指定してinpaintingできます(inpaintingモデルには対応しておらず、単にマスク領域を対象にimg2imgするだけです)。
267
+
268
+ オプションは以下の通りです。
269
+
270
+ - `--mask_image`:マスク画像を指定します。`--image_path`と同様にフォルダを指定すると、そのフォルダの画像を順次利用します。
271
+
272
+ マスク画像はグレースケール画像で、白の部分がinpaintingされます。境界をグラデーションしておくとなんとなく滑らかになりますのでお勧めです。
273
+
274
+ ![image](https://user-images.githubusercontent.com/52813779/235343795-9eaa6d98-02ff-4f32-b089-80d1fc482453.png)
275
+
276
+ # その他の機能
277
+
278
+ ## Textual Inversion
279
+
280
+ `--textual_inversion_embeddings`オプションで使用するembeddingsを指定します(複数指定可)。拡張子を除いたファイル名をプロンプト内で使用することで、そのembeddingsを利用します(Web UIと同様の使用法です)。ネガティブプロンプト内でも使用できます。
281
+
282
+ モデルとして、当リポジトリで学習したTextual Inversionモデル、およびWeb UIで学習したTextual Inversionモデル(画像埋め込みは非対応)を利用できます。
283
+
284
+ ## Extended Textual Inversion
285
+
286
+ `--textual_inversion_embeddings`の代わりに`--XTI_embeddings`オプションを指定してください。使用法は`--textual_inversion_embeddings`と同じです。
287
+
288
+ ## Highres. fix
289
+
290
+ AUTOMATIC1111氏のWeb UIにある機能の類似機能です(独自実装のためもしかしたらいろいろ異なるかもしれません)。最初に小さめの画像を生成し、その画像を元にimg2imgすることで、画像全体の破綻を防ぎつつ大きな解像度の画像を生成します。
291
+
292
+ 2nd stageのstep数は`--steps` と`--strength`オプションの値から計算されます(`steps*strength`)。
293
+
294
+ img2imgと併用できません。
295
+
296
+ 以下のオプションがあります。
297
+
298
+ - `--highres_fix_scale`:Highres. fixを有効にして、1st stageで生成する画像のサイズを、倍率で指定します。最終出力が1024x1024で、最初に512x512の画像を生成する場合は`--highres_fix_scale 0.5`のように指定します。Web UIでの指定の逆数になっていますのでご注意ください。
299
+
300
+ - `--highres_fix_steps`:1st stageの画像のステップ数を指定します。デフォルトは`28`です。
301
+
302
+ - `--highres_fix_save_1st`:1st stageの画像を保存するかどうかを指定します。
303
+
304
+ - `--highres_fix_latents_upscaling`:指定すると2nd stageの画像生成時に1st stageの画像をlatentベースでupscalingします(bilinearのみ対応)。未指定時は画像をLANCZOS4でupscalingします。
305
+
306
+ - `--highres_fix_upscaler`:2nd stageに任意のupscalerを利用します。現在は`--highres_fix_upscaler tools.latent_upscaler` のみ対応しています。
307
+
308
+ - `--highres_fix_upscaler_args`:`--highres_fix_upscaler`で指定したupscalerに渡す引数を指定します。
309
+ `tools.latent_upscaler`の場合は、`--highres_fix_upscaler_args "weights=D:\Work\SD\Models\others\etc\upscaler-v1-e100-220.safetensors"`のように重みファイルを指定します。
310
+
311
+ コマンドラインの例です。
312
+
313
+ ```batchfile
314
+ python gen_img_diffusers.py --ckpt trinart_characters_it4_v1_vae_merged.ckpt
315
+ --n_iter 1 --scale 7.5 --W 1024 --H 1024 --batch_size 1 --outdir ../txt2img
316
+ --steps 48 --sampler ddim --fp16
317
+ --xformers
318
+ --images_per_prompt 1 --interactive
319
+ --highres_fix_scale 0.5 --highres_fix_steps 28 --strength 0.5
320
+ ```
321
+
322
+ ## ControlNet
323
+
324
+ 現在はControlNet 1.0のみ動作確認しています。プリプロセスはCannyのみサポートしています。
325
+
326
+ 以下のオプションがあります。
327
+
328
+ - `--control_net_models`:ControlNetのモデルファイルを指定します。
329
+ 複数指定すると、それらをstepごとに切り替えて利用します(Web UIのControlNet拡張の実装と異なります)。diffと通常の両方をサポートします。
330
+
331
+ - `--guide_image_path`:ControlNetに使うヒント画像を指定します。`--image_path`と同様にフォルダを指定すると、そのフォルダの画像を順次利用します。Canny以外のモデルの場合には、あらかじめプリプロセスを行っておいてください。
332
+
333
+ - `--control_net_preps`:ControlNetのプリプロセスを指定します。`--control_net_models`と同様に複数指定可能です。現在はcannyのみ対応しています。対象モデルでプリプロセスを使用しない場合は `none` を指定します。
334
+ cannyの場合 `--control_net_preps canny_63_191`のように、閾値1と2を'_'で区切って指定できます。
335
+
336
+ - `--control_net_weights`:ControlNetの適用時の重みを指定します(`1.0`で通常、`0.5`なら半分の影響力で適用)。`--control_net_models`と同様に複数指定可能です。
337
+
338
+ - `--control_net_ratios`:ControlNetを適用するstepの範囲を指定します。`0.5`の場合は、step数の半分までControlNetを適用します。`--control_net_models`と同様に複数指定可能です。
339
+
340
+ コマンドラインの例です。
341
+
342
+ ```batchfile
343
+ python gen_img_diffusers.py --ckpt model_ckpt --scale 8 --steps 48 --outdir txt2img --xformers
344
+ --W 512 --H 768 --bf16 --sampler k_euler_a
345
+ --control_net_models diff_control_sd15_canny.safetensors --control_net_weights 1.0
346
+ --guide_image_path guide.png --control_net_ratios 1.0 --interactive
347
+ ```
348
+
349
+ ## Attention Couple + Regional LoRA
350
+
351
+ プロンプトをいくつかの部分に分割し、それぞれのプロンプトを画像内のどの領域に適用するかを指定できる機能です。個別のオプションはありませんが、`--mask_path`とプロンプトで指定します。
352
+
353
+ まず、プロンプトで` AND `を利用して、複数部分を定義します。最初の3つに対して領域指定ができ、以降の部分は画像全体へ適用されます。ネガティブプロンプトは画像全体に適用されます。
354
+
355
+ 以下ではANDで3つの部分を定義しています。
356
+
357
+ ```
358
+ shs 2girls, looking at viewer, smile AND bsb 2girls, looking back AND 2girls --n bad quality, worst quality
359
+ ```
360
+
361
+ 次にマスク画像を用意します。マスク画像はカラーの画像で、RGBの各チャネルがプロンプトのANDで区切られた部分に対応します。またあるチャネルの値がすべて0の場合、画像全体に適用されます。
362
+
363
+ 上記の例では、Rチャネルが`shs 2girls, looking at viewer, smile`、Gチャネルが`bsb 2girls, looking back`に、Bチャネルが`2girls`に対応します。次のようなマスク画像を使用すると、Bチャネルに指定がありませんので、`2girls`は画像全体に適用されます。
364
+
365
+ ![image](https://user-images.githubusercontent.com/52813779/235343061-b4dc9392-3dae-4831-8347-1e9ae5054251.png)
366
+
367
+ マスク画像は`--mask_path`で指定します。現在は1枚のみ対応しています。指定した画像サイズに自動的にリサイズされ適用されます。
368
+
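As an illustration only (any paint tool works just as well), a mask whose left half is assigned to the first AND section (R channel) and whose right half is assigned to the second (G channel) could be generated like this:

```python
import numpy as np
from PIL import Image

width, height = 1024, 512
mask = np.zeros((height, width, 3), dtype=np.uint8)
mask[:, : width // 2, 0] = 255   # R channel: left half -> first AND section
mask[:, width // 2 :, 1] = 255   # G channel: right half -> second AND section
# B channel stays 0, so the third AND section applies to the whole image
Image.fromarray(mask).save("mask.png")
```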
369
+ ControlNetと組み合わせることも可能です(細かい位置指定にはControlNetとの組み合わせを推奨します)。
370
+
371
+ LoRAを指定すると、`--network_weights`で指定した複数のLoRAがそれぞれANDの各部分に対応します。現在の制約として、LoRAの数はANDの部分の数と同じである必要があります。
372
+
373
+ ## CLIP Guided Stable Diffusion
374
+
375
+ DiffusersのCommunity Examplesの[こちらのcustom pipeline](https://github.com/huggingface/diffusers/blob/main/examples/community/README.md#clip-guided-stable-diffusion)からソースをコピー、変更したものです。
376
+
377
+ 通常のプロンプトによる生成指定に加えて、追加でより大規模のCLIPでプロンプトのテキストの特徴量を取得し、生成中の画像の特徴量がそのテキストの特徴量に近づくよう、生成される画像をコントロールします(私のざっくりとした理解です)。大きめのCLIPを使いますのでVRAM使用量はかなり増加し(VRAM 8GBでは512*512でも厳しいかもしれません)、生成時間も掛かります。
378
+
379
+ なお選択できるサンプラーはDDIM、PNDM、LMSのみとなります。
380
+
381
+ `--clip_guidance_scale`オプションにどの程度、CLIPの特徴量を反映するかを数値で指定します。先のサンプルでは100になっていますので、そのあたりから始めて増減すると良いようです。
382
+
383
+ デフォルトではプロンプトの先頭75トークン(重みづけの特殊文字を除く)がCLIPに渡されます。プロンプトの`--c`オプションで、通常のプロンプトではなく、CLIPに渡すテキストを別に指定できます(たとえばCLIPはDreamBoothのidentifier(識別子)や「1girl」などのモデル特有の単語は認識できないと思われますので、それらを省いたテキストが良いと思われます)。
384
+
385
+ コマンドラインの例です。
386
+
387
+ ```batchfile
388
+ python gen_img_diffusers.py --ckpt v1-5-pruned-emaonly.ckpt --n_iter 1
389
+ --scale 2.5 --W 512 --H 512 --batch_size 1 --outdir ../txt2img --steps 36
390
+ --sampler ddim --fp16 --opt_channels_last --xformers --images_per_prompt 1
391
+ --interactive --clip_guidance_scale 100
392
+ ```
393
+
394
+ ## CLIP Image Guided Stable Diffusion
395
+
396
+ テキストではなくCLIPに別の画像を渡し、その特徴量に近づくよう生成をコントロールする機能です。`--clip_image_guidance_scale`オプションで適用量の数値を、`--guide_image_path`オプションでguideに使用する画像(ファイルまたはフォルダ)を指定してください。
397
+
398
+ コマンドラインの例です。
399
+
400
+ ```batchfile
401
+ python gen_img_diffusers.py --ckpt trinart_characters_it4_v1_vae_merged.ckpt
402
+ --n_iter 1 --scale 7.5 --W 512 --H 512 --batch_size 1 --outdir ../txt2img
403
+ --steps 80 --sampler ddim --fp16 --opt_channels_last --xformers
404
+ --images_per_prompt 1 --interactive --clip_image_guidance_scale 100
405
+ --guide_image_path YUKA160113420I9A4104_TP_V.jpg
406
+ ```
407
+
408
+ ### VGG16 Guided Stable Diffusion
409
+
410
+ 指定した画像に近づくように画像生成する機能です。通常のプロンプトによる生成指定に加えて、追加でVGG16の特徴量を取得し、生成中の画像が指定したガイド画像に近づくよう、生成される画像をコントロールします。img2imgでの使用をお勧めします(通常の生成では画像がぼやけた感じになります)。CLIP Guided Stable Diffusionの仕組みを流用した独自の機能です。またアイデアはVGGを利用したスタイル変換から拝借しています。
411
+
412
+ なお選択できるサンプラーはDDIM、PNDM、LMSのみとなります。
413
+
414
+ `--vgg16_guidance_scale`オプションにどの程度、VGG16特徴量を反映するかを数値で指定します。試した感じでは100くらいから始めて増減すると良いようです。`--guide_image_path`オプションでguideに使用する画像(ファイルまたはフォルダ)を指定してください。
415
+
416
+ 複数枚の画像を一括でimg2img変換し、元画像をガイド画像とする場合、`--guide_image_path`と`--image_path`に同じ値を指定すればOKです。
417
+
418
+ コマンドラインの例です。
419
+
420
+ ```batchfile
421
+ python gen_img_diffusers.py --ckpt wd-v1-3-full-pruned-half.ckpt
422
+ --n_iter 1 --scale 5.5 --steps 60 --outdir ../txt2img
423
+ --xformers --sampler ddim --fp16 --W 512 --H 704
424
+ --batch_size 1 --images_per_prompt 1
425
+ --prompt "picturesque, 1girl, solo, anime face, skirt, beautiful face
426
+ --n lowres, bad anatomy, bad hands, error, missing fingers,
427
+ cropped, worst quality, low quality, normal quality,
428
+ jpeg artifacts, blurry, 3d, bad face, monochrome --d 1"
429
+ --strength 0.8 --image_path ..\src_image
430
+ --vgg16_guidance_scale 100 --guide_image_path ..\src_image
431
+ ```
432
+
433
+ `--vgg16_guidance_layer`で特徴量取得に使用するVGG16のレイヤー番号を指定できます(デフォルトは20でconv4-2のReLUです)。上の層ほど画風を表現し、下の層ほどコンテンツを表現するといわれています。
434
+
435
+ ![image](https://user-images.githubusercontent.com/52813779/235343813-3c1f0d7a-4fb3-4274-98e4-b92d76b551df.png)
436
+
437
+ # その他のオプション
438
+
439
+ - `--no_preview` : 対話モードでプレビュー画像を表示しません。OpenCVがインストールされていない場合や、出力されたファイルを直接確認する場合に指定してください。
440
+
441
+ - `--n_iter` : 生成を繰り返す回数を指定します。デフォルトは1です。プロンプトをファイルから読み込むとき、複数回の生成を行いたい場合に指定します。
442
+
443
+ - `--tokenizer_cache_dir` : トークナイザーのキャッシュディレクトリを指定します。(作業中)
444
+
445
+ - `--seed` : 乱数seedを指定します。1枚生成時はその画像のseed、複数枚生成時は各画像のseedを生成するための乱数のseedになります(`--from_file`で複数画像生成するとき、`--seed`オプションを指定すると複数回実行したときに各画像が同じseedになります)。
446
+
447
+ - `--iter_same_seed` : プロンプトに乱数seedの指定がないとき、`--n_iter`の繰り返し内ではすべて同じseedを使います。`--from_file`で指定した複数のプロンプト間でseedを統一して比較するときに使います。
448
+
449
+ - `--diffusers_xformers` : Diffuserのxformersを使用します。
450
+
451
+ - `--opt_channels_last` : 推論時にテンソルのチャンネルを最後に配置します。場合によっては高速化されることがあります。
452
+
453
+ - `--network_show_meta` : 追加ネットワークのメタデータを表示します。
454
+
455
+
456
+ ---
457
+
458
+ # About Gradual Latent
459
+
460
+ Gradual Latent is a Hires fix that gradually increases the size of the latent. `gen_img.py`, `sdxl_gen_img.py`, and `gen_img_diffusers.py` have the following options.
461
+
462
+ - `--gradual_latent_timesteps`: Specifies the timestep to start increasing the size of the latent. The default is None, which means Gradual Latent is not used. Please try around 750 at first.
463
+ - `--gradual_latent_ratio`: Specifies the initial size of the latent. The default is 0.5, which means it starts with half the default latent size.
464
+ - `--gradual_latent_ratio_step`: Specifies the ratio to increase the size of the latent. The default is 0.125, which means the latent size is gradually increased to 0.625, 0.75, 0.875, 1.0.
465
+ - `--gradual_latent_ratio_every_n_steps`: Specifies the interval to increase the size of the latent. The default is 3, which means the latent size is increased every 3 steps.
466
+
467
+ Each option can also be specified with prompt options, `--glt`, `--glr`, `--gls`, `--gle`.
468
+
469
+ __Please specify `euler_a` for the sampler.__ The sampler's source code is modified, so it will not work with other samplers.
470
+
471
+ It is more effective with SD 1.5. It is quite subtle with SDXL.
472
+
473
+ # Gradual Latent について
474
+
475
+ latentのサイズを徐々に大きくしていくHires fixです。`gen_img.py` 、`sdxl_gen_img.py` 、`gen_img_diffusers.py` に以下のオプションが追加されています。
476
+
477
+ - `--gradual_latent_timesteps` : latentのサイズを大きくし始めるタイムステップを指定します。デフォルトは None で、Gradual Latentを使用しません。750 くらいから始めてみてください。
478
+ - `--gradual_latent_ratio` : latentの初期サイズを指定します。デフォルトは 0.5 で、デフォルトの latent サイズの半分のサイズから始めます。
479
+ - `--gradual_latent_ratio_step`: latentのサイズを大きくする割合を指定します。デフォルトは 0.125 で、latentのサイズを 0.625, 0.75, 0.875, 1.0 と徐々に大きくします。
480
+ - `--gradual_latent_ratio_every_n_steps`: latentのサイズを大きくする間隔を指定します。デフォルトは 3 で、3ステップごとに latent のサイズを大きくします。
481
+
482
+ それぞれのオプションは、プロンプトオプション、`--glt`、`--glr`、`--gls`、`--gle` でも指定できます。
483
+
484
+ サンプラーに手を加えているため、__サンプラーに `euler_a` を指定してください。__ 他のサンプラーでは動作しません。
485
+
486
+ SD 1.5 のほうが効果があります。SDXL ではかなり微妙です。
487
+
gradscaler.py ADDED
@@ -0,0 +1,183 @@
1
+ from collections import defaultdict
2
+ import torch
3
+ import intel_extension_for_pytorch as ipex # pylint: disable=import-error, unused-import
4
+ import intel_extension_for_pytorch._C as core # pylint: disable=import-error, unused-import
5
+
6
+ # pylint: disable=protected-access, missing-function-docstring, line-too-long
7
+
8
+ device_supports_fp64 = torch.xpu.has_fp64_dtype() if hasattr(torch.xpu, "has_fp64_dtype") else torch.xpu.get_device_properties("xpu").has_fp64
9
+ OptState = ipex.cpu.autocast._grad_scaler.OptState
10
+ _MultiDeviceReplicator = ipex.cpu.autocast._grad_scaler._MultiDeviceReplicator
11
+ _refresh_per_optimizer_state = ipex.cpu.autocast._grad_scaler._refresh_per_optimizer_state
12
+
13
+ def _unscale_grads_(self, optimizer, inv_scale, found_inf, allow_fp16): # pylint: disable=unused-argument
14
+ per_device_inv_scale = _MultiDeviceReplicator(inv_scale)
15
+ per_device_found_inf = _MultiDeviceReplicator(found_inf)
16
+
17
+ # To set up _amp_foreach_non_finite_check_and_unscale_, split grads by device and dtype.
18
+ # There could be hundreds of grads, so we'd like to iterate through them just once.
19
+ # However, we don't know their devices or dtypes in advance.
20
+
21
+ # https://stackoverflow.com/questions/5029934/defaultdict-of-defaultdict
22
+ # Google says mypy struggles with defaultdicts type annotations.
23
+ per_device_and_dtype_grads = defaultdict(lambda: defaultdict(list)) # type: ignore[var-annotated]
24
+ # sync grad to master weight
25
+ if hasattr(optimizer, "sync_grad"):
26
+ optimizer.sync_grad()
27
+ with torch.no_grad():
28
+ for group in optimizer.param_groups:
29
+ for param in group["params"]:
30
+ if param.grad is None:
31
+ continue
32
+ if (not allow_fp16) and param.grad.dtype == torch.float16:
33
+ raise ValueError("Attempting to unscale FP16 gradients.")
34
+ if param.grad.is_sparse:
35
+ # is_coalesced() == False means the sparse grad has values with duplicate indices.
36
+ # coalesce() deduplicates indices and adds all values that have the same index.
37
+ # For scaled fp16 values, there's a good chance coalescing will cause overflow,
38
+ # so we should check the coalesced _values().
39
+ if param.grad.dtype is torch.float16:
40
+ param.grad = param.grad.coalesce()
41
+ to_unscale = param.grad._values()
42
+ else:
43
+ to_unscale = param.grad
44
+
45
+ # -: is there a way to split by device and dtype without appending in the inner loop?
46
+ to_unscale = to_unscale.to("cpu")
47
+ per_device_and_dtype_grads[to_unscale.device][
48
+ to_unscale.dtype
49
+ ].append(to_unscale)
50
+
51
+ for _, per_dtype_grads in per_device_and_dtype_grads.items():
52
+ for grads in per_dtype_grads.values():
53
+ core._amp_foreach_non_finite_check_and_unscale_(
54
+ grads,
55
+ per_device_found_inf.get("cpu"),
56
+ per_device_inv_scale.get("cpu"),
57
+ )
58
+
59
+ return per_device_found_inf._per_device_tensors
60
+
61
+ def unscale_(self, optimizer):
62
+ """
63
+ Divides ("unscales") the optimizer's gradient tensors by the scale factor.
64
+ :meth:`unscale_` is optional, serving cases where you need to
65
+ :ref:`modify or inspect gradients<working-with-unscaled-gradients>`
66
+ between the backward pass(es) and :meth:`step`.
67
+ If :meth:`unscale_` is not called explicitly, gradients will be unscaled automatically during :meth:`step`.
68
+ Simple example, using :meth:`unscale_` to enable clipping of unscaled gradients::
69
+ ...
70
+ scaler.scale(loss).backward()
71
+ scaler.unscale_(optimizer)
72
+ torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm)
73
+ scaler.step(optimizer)
74
+ scaler.update()
75
+ Args:
76
+ optimizer (torch.optim.Optimizer): Optimizer that owns the gradients to be unscaled.
77
+ .. warning::
78
+ :meth:`unscale_` should only be called once per optimizer per :meth:`step` call,
79
+ and only after all gradients for that optimizer's assigned parameters have been accumulated.
80
+ Calling :meth:`unscale_` twice for a given optimizer between each :meth:`step` triggers a RuntimeError.
81
+ .. warning::
82
+ :meth:`unscale_` may unscale sparse gradients out of place, replacing the ``.grad`` attribute.
83
+ """
84
+ if not self._enabled:
85
+ return
86
+
87
+ self._check_scale_growth_tracker("unscale_")
88
+
89
+ optimizer_state = self._per_optimizer_states[id(optimizer)]
90
+
91
+ if optimizer_state["stage"] is OptState.UNSCALED: # pylint: disable=no-else-raise
92
+ raise RuntimeError(
93
+ "unscale_() has already been called on this optimizer since the last update()."
94
+ )
95
+ elif optimizer_state["stage"] is OptState.STEPPED:
96
+ raise RuntimeError("unscale_() is being called after step().")
97
+
98
+ # FP32 division can be imprecise for certain compile options, so we carry out the reciprocal in FP64.
99
+ assert self._scale is not None
100
+ if device_supports_fp64:
101
+ inv_scale = self._scale.double().reciprocal().float()
102
+ else:
103
+ inv_scale = self._scale.to("cpu").double().reciprocal().float().to(self._scale.device)
104
+ found_inf = torch.full(
105
+ (1,), 0.0, dtype=torch.float32, device=self._scale.device
106
+ )
107
+
108
+ optimizer_state["found_inf_per_device"] = self._unscale_grads_(
109
+ optimizer, inv_scale, found_inf, False
110
+ )
111
+ optimizer_state["stage"] = OptState.UNSCALED
112
+
113
+ def update(self, new_scale=None):
114
+ """
115
+ Updates the scale factor.
116
+ If any optimizer steps were skipped the scale is multiplied by ``backoff_factor``
117
+ to reduce it. If ``growth_interval`` unskipped iterations occurred consecutively,
118
+ the scale is multiplied by ``growth_factor`` to increase it.
119
+ Passing ``new_scale`` sets the new scale value manually. (``new_scale`` is not
120
+ used directly, it's used to fill GradScaler's internal scale tensor. So if
121
+ ``new_scale`` was a tensor, later in-place changes to that tensor will not further
122
+ affect the scale GradScaler uses internally.)
123
+ Args:
124
+ new_scale (float or :class:`torch.FloatTensor`, optional, default=None): New scale factor.
125
+ .. warning::
126
+ :meth:`update` should only be called at the end of the iteration, after ``scaler.step(optimizer)`` has
127
+ been invoked for all optimizers used this iteration.
128
+ """
129
+ if not self._enabled:
130
+ return
131
+
132
+ _scale, _growth_tracker = self._check_scale_growth_tracker("update")
133
+
134
+ if new_scale is not None:
135
+ # Accept a new user-defined scale.
136
+ if isinstance(new_scale, float):
137
+ self._scale.fill_(new_scale) # type: ignore[union-attr]
138
+ else:
139
+ reason = "new_scale should be a float or a 1-element torch.FloatTensor with requires_grad=False."
140
+ assert isinstance(new_scale, torch.FloatTensor), reason # type: ignore[attr-defined]
141
+ assert new_scale.numel() == 1, reason
142
+ assert new_scale.requires_grad is False, reason
143
+ self._scale.copy_(new_scale) # type: ignore[union-attr]
144
+ else:
145
+ # Consume shared inf/nan data collected from optimizers to update the scale.
146
+ # If all found_inf tensors are on the same device as self._scale, this operation is asynchronous.
147
+ found_infs = [
148
+ found_inf.to(device="cpu", non_blocking=True)
149
+ for state in self._per_optimizer_states.values()
150
+ for found_inf in state["found_inf_per_device"].values()
151
+ ]
152
+
153
+ assert len(found_infs) > 0, "No inf checks were recorded prior to update."
154
+
155
+ found_inf_combined = found_infs[0]
156
+ if len(found_infs) > 1:
157
+ for i in range(1, len(found_infs)):
158
+ found_inf_combined += found_infs[i]
159
+
160
+ to_device = _scale.device
161
+ _scale = _scale.to("cpu")
162
+ _growth_tracker = _growth_tracker.to("cpu")
163
+
164
+ core._amp_update_scale_(
165
+ _scale,
166
+ _growth_tracker,
167
+ found_inf_combined,
168
+ self._growth_factor,
169
+ self._backoff_factor,
170
+ self._growth_interval,
171
+ )
172
+
173
+ _scale = _scale.to(to_device)
174
+ _growth_tracker = _growth_tracker.to(to_device)
175
+ # To prepare for next iteration, clear the data collected from optimizers this iteration.
176
+ self._per_optimizer_states = defaultdict(_refresh_per_optimizer_state)
177
+
178
+ def gradscaler_init():
179
+ torch.xpu.amp.GradScaler = ipex.cpu.autocast._grad_scaler.GradScaler
180
+ torch.xpu.amp.GradScaler._unscale_grads_ = _unscale_grads_
181
+ torch.xpu.amp.GradScaler.unscale_ = unscale_
182
+ torch.xpu.amp.GradScaler.update = update
183
+ return torch.xpu.amp.GradScaler
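Note: the patched scaler above follows the standard GradScaler API, only on "xpu" tensors. Below is a minimal usage sketch, assuming an Intel XPU device with intel_extension_for_pytorch installed and gradscaler_init() already executed; the toy model, data, and optimizer are hypothetical and only illustrate the call sequence.

import torch

model = torch.nn.Linear(8, 1).to("xpu")                  # hypothetical toy model
optimizer = torch.optim.SGD(model.parameters(), lr=1e-3)
scaler = torch.xpu.amp.GradScaler()                      # class installed by gradscaler_init()

x = torch.randn(4, 8, device="xpu")
y = torch.randn(4, 1, device="xpu")

optimizer.zero_grad()
with torch.autocast("xpu", dtype=torch.float16):
    loss = torch.nn.functional.mse_loss(model(x), y)
scaler.scale(loss).backward()   # backward on the scaled loss
scaler.step(optimizer)          # skipped automatically when inf/nan gradients are found
scaler.update()                 # grows or backs off the scale as described in update() above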
hijacks.py ADDED
@@ -0,0 +1,367 @@
1
+ import os
2
+ from functools import wraps
3
+ from contextlib import nullcontext
4
+ import torch
5
+ import numpy as np
6
+
7
+ device_supports_fp64 = torch.xpu.has_fp64_dtype() if hasattr(torch.xpu, "has_fp64_dtype") else torch.xpu.get_device_properties("xpu").has_fp64
8
+ if os.environ.get('IPEX_FORCE_ATTENTION_SLICE', '0') == '0' and (torch.xpu.get_device_properties("xpu").total_memory / 1024 / 1024 / 1024) > 4.1:
9
+ try:
10
+ x = torch.ones((33000,33000), dtype=torch.float32, device="xpu")
11
+ del x
12
+ torch.xpu.empty_cache()
13
+ can_allocate_plus_4gb = True
14
+ except Exception:
15
+ can_allocate_plus_4gb = False
16
+ else:
17
+ can_allocate_plus_4gb = bool(os.environ.get('IPEX_FORCE_ATTENTION_SLICE', '0') == '-1')
18
+
19
+ # pylint: disable=protected-access, missing-function-docstring, line-too-long, unnecessary-lambda, no-else-return
20
+
21
+ class DummyDataParallel(torch.nn.Module): # pylint: disable=missing-class-docstring, unused-argument, too-few-public-methods
22
+ def __new__(cls, module, device_ids=None, output_device=None, dim=0): # pylint: disable=unused-argument
23
+ if isinstance(device_ids, list) and len(device_ids) > 1:
24
+ print("IPEX backend doesn't support DataParallel on multiple XPU devices")
25
+ return module.to("xpu")
26
+
27
+ def return_null_context(*args, **kwargs): # pylint: disable=unused-argument
28
+ return nullcontext()
29
+
30
+ @property
31
+ def is_cuda(self):
32
+ return self.device.type == 'xpu' or self.device.type == 'cuda'
33
+
34
+ def check_device(device):
35
+ return bool((isinstance(device, torch.device) and device.type == "cuda") or (isinstance(device, str) and "cuda" in device) or isinstance(device, int))
36
+
37
+ def return_xpu(device):
38
+ return f"xpu:{device.split(':')[-1]}" if isinstance(device, str) and ":" in device else f"xpu:{device}" if isinstance(device, int) else torch.device(f"xpu:{device.index}" if device.index is not None else "xpu") if isinstance(device, torch.device) else "xpu"
39
+
40
+
41
+ # Autocast
42
+ original_autocast_init = torch.amp.autocast_mode.autocast.__init__
43
+ @wraps(torch.amp.autocast_mode.autocast.__init__)
44
+ def autocast_init(self, device_type, dtype=None, enabled=True, cache_enabled=None):
45
+ if device_type == "cuda":
46
+ return original_autocast_init(self, device_type="xpu", dtype=dtype, enabled=enabled, cache_enabled=cache_enabled)
47
+ else:
48
+ return original_autocast_init(self, device_type=device_type, dtype=dtype, enabled=enabled, cache_enabled=cache_enabled)
49
+
50
+ # Latent Antialias CPU Offload:
51
+ original_interpolate = torch.nn.functional.interpolate
52
+ @wraps(torch.nn.functional.interpolate)
53
+ def interpolate(tensor, size=None, scale_factor=None, mode='nearest', align_corners=None, recompute_scale_factor=None, antialias=False): # pylint: disable=too-many-arguments
54
+ if mode in {'bicubic', 'bilinear'}:
55
+ return_device = tensor.device
56
+ return_dtype = tensor.dtype
57
+ return original_interpolate(tensor.to("cpu", dtype=torch.float32), size=size, scale_factor=scale_factor, mode=mode,
58
+ align_corners=align_corners, recompute_scale_factor=recompute_scale_factor, antialias=antialias).to(return_device, dtype=return_dtype)
59
+ else:
60
+ return original_interpolate(tensor, size=size, scale_factor=scale_factor, mode=mode,
61
+ align_corners=align_corners, recompute_scale_factor=recompute_scale_factor, antialias=antialias)
62
+
63
+
64
+ # Diffusers Float64 (Alchemist GPUs don't support 64-bit floats):
65
+ original_from_numpy = torch.from_numpy
66
+ @wraps(torch.from_numpy)
67
+ def from_numpy(ndarray):
68
+ if ndarray.dtype == float:
69
+ return original_from_numpy(ndarray.astype('float32'))
70
+ else:
71
+ return original_from_numpy(ndarray)
72
+
73
+ original_as_tensor = torch.as_tensor
74
+ @wraps(torch.as_tensor)
75
+ def as_tensor(data, dtype=None, device=None):
76
+ if check_device(device):
77
+ device = return_xpu(device)
78
+ if isinstance(data, np.ndarray) and data.dtype == float and not (
79
+ (isinstance(device, torch.device) and device.type == "cpu") or (isinstance(device, str) and "cpu" in device)):
80
+ return original_as_tensor(data, dtype=torch.float32, device=device)
81
+ else:
82
+ return original_as_tensor(data, dtype=dtype, device=device)
83
+
84
+
85
+ if can_allocate_plus_4gb:
86
+ original_scaled_dot_product_attention = torch.nn.functional.scaled_dot_product_attention
87
+ else:
88
+ # 32 bit attention workarounds for Alchemist:
89
+ try:
90
+ from .attention import dynamic_scaled_dot_product_attention as original_scaled_dot_product_attention
91
+ except Exception: # pylint: disable=broad-exception-caught
92
+ original_scaled_dot_product_attention = torch.nn.functional.scaled_dot_product_attention
93
+
94
+ @wraps(torch.nn.functional.scaled_dot_product_attention)
95
+ def scaled_dot_product_attention(query, key, value, attn_mask=None, dropout_p=0.0, is_causal=False, **kwargs):
96
+ if query.dtype != key.dtype:
97
+ key = key.to(dtype=query.dtype)
98
+ if query.dtype != value.dtype:
99
+ value = value.to(dtype=query.dtype)
100
+ if attn_mask is not None and query.dtype != attn_mask.dtype:
101
+ attn_mask = attn_mask.to(dtype=query.dtype)
102
+ return original_scaled_dot_product_attention(query, key, value, attn_mask=attn_mask, dropout_p=dropout_p, is_causal=is_causal, **kwargs)
103
+
104
+ # Data Type Errors:
105
+ original_torch_bmm = torch.bmm
106
+ @wraps(torch.bmm)
107
+ def torch_bmm(input, mat2, *, out=None):
108
+ if input.dtype != mat2.dtype:
109
+ mat2 = mat2.to(input.dtype)
110
+ return original_torch_bmm(input, mat2, out=out)
111
+
112
+ # Diffusers FreeU
113
+ original_fft_fftn = torch.fft.fftn
114
+ @wraps(torch.fft.fftn)
115
+ def fft_fftn(input, s=None, dim=None, norm=None, *, out=None):
116
+ return_dtype = input.dtype
117
+ return original_fft_fftn(input.to(dtype=torch.float32), s=s, dim=dim, norm=norm, out=out).to(dtype=return_dtype)
118
+
119
+ # Diffusers FreeU
120
+ original_fft_ifftn = torch.fft.ifftn
121
+ @wraps(torch.fft.ifftn)
122
+ def fft_ifftn(input, s=None, dim=None, norm=None, *, out=None):
123
+ return_dtype = input.dtype
124
+ return original_fft_ifftn(input.to(dtype=torch.float32), s=s, dim=dim, norm=norm, out=out).to(dtype=return_dtype)
125
+
126
+ # A1111 FP16
127
+ original_functional_group_norm = torch.nn.functional.group_norm
128
+ @wraps(torch.nn.functional.group_norm)
129
+ def functional_group_norm(input, num_groups, weight=None, bias=None, eps=1e-05):
130
+ if weight is not None and input.dtype != weight.data.dtype:
131
+ input = input.to(dtype=weight.data.dtype)
132
+ if bias is not None and weight is not None and bias.data.dtype != weight.data.dtype:
133
+ bias.data = bias.data.to(dtype=weight.data.dtype)
134
+ return original_functional_group_norm(input, num_groups, weight=weight, bias=bias, eps=eps)
135
+
136
+ # A1111 BF16
137
+ original_functional_layer_norm = torch.nn.functional.layer_norm
138
+ @wraps(torch.nn.functional.layer_norm)
139
+ def functional_layer_norm(input, normalized_shape, weight=None, bias=None, eps=1e-05):
140
+ if weight is not None and input.dtype != weight.data.dtype:
141
+ input = input.to(dtype=weight.data.dtype)
142
+ if bias is not None and weight is not None and bias.data.dtype != weight.data.dtype:
143
+ bias.data = bias.data.to(dtype=weight.data.dtype)
144
+ return original_functional_layer_norm(input, normalized_shape, weight=weight, bias=bias, eps=eps)
145
+
146
+ # Training
147
+ original_functional_linear = torch.nn.functional.linear
148
+ @wraps(torch.nn.functional.linear)
149
+ def functional_linear(input, weight, bias=None):
150
+ if input.dtype != weight.data.dtype:
151
+ input = input.to(dtype=weight.data.dtype)
152
+ if bias is not None and bias.data.dtype != weight.data.dtype:
153
+ bias.data = bias.data.to(dtype=weight.data.dtype)
154
+ return original_functional_linear(input, weight, bias=bias)
155
+
156
+ original_functional_conv1d = torch.nn.functional.conv1d
157
+ @wraps(torch.nn.functional.conv1d)
158
+ def functional_conv1d(input, weight, bias=None, stride=1, padding=0, dilation=1, groups=1):
159
+ if input.dtype != weight.data.dtype:
160
+ input = input.to(dtype=weight.data.dtype)
161
+ if bias is not None and bias.data.dtype != weight.data.dtype:
162
+ bias.data = bias.data.to(dtype=weight.data.dtype)
163
+ return original_functional_conv1d(input, weight, bias=bias, stride=stride, padding=padding, dilation=dilation, groups=groups)
164
+
165
+ original_functional_conv2d = torch.nn.functional.conv2d
166
+ @wraps(torch.nn.functional.conv2d)
167
+ def functional_conv2d(input, weight, bias=None, stride=1, padding=0, dilation=1, groups=1):
168
+ if input.dtype != weight.data.dtype:
169
+ input = input.to(dtype=weight.data.dtype)
170
+ if bias is not None and bias.data.dtype != weight.data.dtype:
171
+ bias.data = bias.data.to(dtype=weight.data.dtype)
172
+ return original_functional_conv2d(input, weight, bias=bias, stride=stride, padding=padding, dilation=dilation, groups=groups)
173
+
174
+ # LTX Video
175
+ original_functional_conv3d = torch.nn.functional.conv3d
176
+ @wraps(torch.nn.functional.conv3d)
177
+ def functional_conv3d(input, weight, bias=None, stride=1, padding=0, dilation=1, groups=1):
178
+ if input.dtype != weight.data.dtype:
179
+ input = input.to(dtype=weight.data.dtype)
180
+ if bias is not None and bias.data.dtype != weight.data.dtype:
181
+ bias.data = bias.data.to(dtype=weight.data.dtype)
182
+ return original_functional_conv3d(input, weight, bias=bias, stride=stride, padding=padding, dilation=dilation, groups=groups)
183
+
184
+ # SwinIR BF16:
185
+ original_functional_pad = torch.nn.functional.pad
186
+ @wraps(torch.nn.functional.pad)
187
+ def functional_pad(input, pad, mode='constant', value=None):
188
+ if mode == 'reflect' and input.dtype == torch.bfloat16:
189
+ return original_functional_pad(input.to(torch.float32), pad, mode=mode, value=value).to(dtype=torch.bfloat16)
190
+ else:
191
+ return original_functional_pad(input, pad, mode=mode, value=value)
192
+
193
+
194
+ original_torch_tensor = torch.tensor
195
+ @wraps(torch.tensor)
196
+ def torch_tensor(data, *args, dtype=None, device=None, **kwargs):
197
+ global device_supports_fp64
198
+ if check_device(device):
199
+ device = return_xpu(device)
200
+ if not device_supports_fp64:
201
+ if (isinstance(device, torch.device) and device.type == "xpu") or (isinstance(device, str) and "xpu" in device):
202
+ if dtype == torch.float64:
203
+ dtype = torch.float32
204
+ elif dtype is None and (hasattr(data, "dtype") and (data.dtype == torch.float64 or data.dtype == float)):
205
+ dtype = torch.float32
206
+ return original_torch_tensor(data, *args, dtype=dtype, device=device, **kwargs)
207
+
208
+ original_Tensor_to = torch.Tensor.to
209
+ @wraps(torch.Tensor.to)
210
+ def Tensor_to(self, device=None, *args, **kwargs):
211
+ if check_device(device):
212
+ return original_Tensor_to(self, return_xpu(device), *args, **kwargs)
213
+ else:
214
+ return original_Tensor_to(self, device, *args, **kwargs)
215
+
216
+ original_Tensor_cuda = torch.Tensor.cuda
217
+ @wraps(torch.Tensor.cuda)
218
+ def Tensor_cuda(self, device=None, *args, **kwargs):
219
+ if check_device(device):
220
+ return original_Tensor_cuda(self, return_xpu(device), *args, **kwargs)
221
+ else:
222
+ return original_Tensor_cuda(self, device, *args, **kwargs)
223
+
224
+ original_Tensor_pin_memory = torch.Tensor.pin_memory
225
+ @wraps(torch.Tensor.pin_memory)
226
+ def Tensor_pin_memory(self, device=None, *args, **kwargs):
227
+ if device is None:
228
+ device = "xpu"
229
+ if check_device(device):
230
+ return original_Tensor_pin_memory(self, return_xpu(device), *args, **kwargs)
231
+ else:
232
+ return original_Tensor_pin_memory(self, device, *args, **kwargs)
233
+
234
+ original_UntypedStorage_init = torch.UntypedStorage.__init__
235
+ @wraps(torch.UntypedStorage.__init__)
236
+ def UntypedStorage_init(*args, device=None, **kwargs):
237
+ if check_device(device):
238
+ return original_UntypedStorage_init(*args, device=return_xpu(device), **kwargs)
239
+ else:
240
+ return original_UntypedStorage_init(*args, device=device, **kwargs)
241
+
242
+ original_UntypedStorage_cuda = torch.UntypedStorage.cuda
243
+ @wraps(torch.UntypedStorage.cuda)
244
+ def UntypedStorage_cuda(self, device=None, *args, **kwargs):
245
+ if check_device(device):
246
+ return original_UntypedStorage_cuda(self, return_xpu(device), *args, **kwargs)
247
+ else:
248
+ return original_UntypedStorage_cuda(self, device, *args, **kwargs)
249
+
250
+ original_torch_empty = torch.empty
251
+ @wraps(torch.empty)
252
+ def torch_empty(*args, device=None, **kwargs):
253
+ if check_device(device):
254
+ return original_torch_empty(*args, device=return_xpu(device), **kwargs)
255
+ else:
256
+ return original_torch_empty(*args, device=device, **kwargs)
257
+
258
+ original_torch_randn = torch.randn
259
+ @wraps(torch.randn)
260
+ def torch_randn(*args, device=None, dtype=None, **kwargs):
261
+ if dtype is bytes:
262
+ dtype = None
263
+ if check_device(device):
264
+ return original_torch_randn(*args, device=return_xpu(device), **kwargs)
265
+ else:
266
+ return original_torch_randn(*args, device=device, **kwargs)
267
+
268
+ original_torch_ones = torch.ones
269
+ @wraps(torch.ones)
270
+ def torch_ones(*args, device=None, **kwargs):
271
+ if check_device(device):
272
+ return original_torch_ones(*args, device=return_xpu(device), **kwargs)
273
+ else:
274
+ return original_torch_ones(*args, device=device, **kwargs)
275
+
276
+ original_torch_zeros = torch.zeros
277
+ @wraps(torch.zeros)
278
+ def torch_zeros(*args, device=None, **kwargs):
279
+ if check_device(device):
280
+ return original_torch_zeros(*args, device=return_xpu(device), **kwargs)
281
+ else:
282
+ return original_torch_zeros(*args, device=device, **kwargs)
283
+
284
+ original_torch_full = torch.full
285
+ @wraps(torch.full)
286
+ def torch_full(*args, device=None, **kwargs):
287
+ if check_device(device):
288
+ return original_torch_full(*args, device=return_xpu(device), **kwargs)
289
+ else:
290
+ return original_torch_full(*args, device=device, **kwargs)
291
+
292
+ original_torch_linspace = torch.linspace
293
+ @wraps(torch.linspace)
294
+ def torch_linspace(*args, device=None, **kwargs):
295
+ if check_device(device):
296
+ return original_torch_linspace(*args, device=return_xpu(device), **kwargs)
297
+ else:
298
+ return original_torch_linspace(*args, device=device, **kwargs)
299
+
300
+ original_torch_load = torch.load
301
+ @wraps(torch.load)
302
+ def torch_load(f, map_location=None, *args, **kwargs):
303
+ if map_location is None:
304
+ map_location = "xpu"
305
+ if check_device(map_location):
306
+ return original_torch_load(f, *args, map_location=return_xpu(map_location), **kwargs)
307
+ else:
308
+ return original_torch_load(f, *args, map_location=map_location, **kwargs)
309
+
310
+ original_torch_Generator = torch.Generator
311
+ @wraps(torch.Generator)
312
+ def torch_Generator(device=None):
313
+ if check_device(device):
314
+ return original_torch_Generator(return_xpu(device))
315
+ else:
316
+ return original_torch_Generator(device)
317
+
318
+ @wraps(torch.cuda.synchronize)
319
+ def torch_cuda_synchronize(device=None):
320
+ if check_device(device):
321
+ return torch.xpu.synchronize(return_xpu(device))
322
+ else:
323
+ return torch.xpu.synchronize(device)
324
+
325
+
326
+ # Hijack Functions:
327
+ def ipex_hijacks(legacy=True):
328
+ global device_supports_fp64, can_allocate_plus_4gb
329
+ if legacy and float(torch.__version__[:3]) < 2.5:
330
+ torch.nn.functional.interpolate = interpolate
331
+ torch.tensor = torch_tensor
332
+ torch.Tensor.to = Tensor_to
333
+ torch.Tensor.cuda = Tensor_cuda
334
+ torch.Tensor.pin_memory = Tensor_pin_memory
335
+ torch.UntypedStorage.__init__ = UntypedStorage_init
336
+ torch.UntypedStorage.cuda = UntypedStorage_cuda
337
+ torch.empty = torch_empty
338
+ torch.randn = torch_randn
339
+ torch.ones = torch_ones
340
+ torch.zeros = torch_zeros
341
+ torch.full = torch_full
342
+ torch.linspace = torch_linspace
343
+ torch.load = torch_load
344
+ torch.Generator = torch_Generator
345
+ torch.cuda.synchronize = torch_cuda_synchronize
346
+
347
+ torch.backends.cuda.sdp_kernel = return_null_context
348
+ torch.nn.DataParallel = DummyDataParallel
349
+ torch.UntypedStorage.is_cuda = is_cuda
350
+ torch.amp.autocast_mode.autocast.__init__ = autocast_init
351
+
352
+ torch.nn.functional.scaled_dot_product_attention = scaled_dot_product_attention
353
+ torch.nn.functional.group_norm = functional_group_norm
354
+ torch.nn.functional.layer_norm = functional_layer_norm
355
+ torch.nn.functional.linear = functional_linear
356
+ torch.nn.functional.conv1d = functional_conv1d
357
+ torch.nn.functional.conv2d = functional_conv2d
358
+ torch.nn.functional.conv3d = functional_conv3d
359
+ torch.nn.functional.pad = functional_pad
360
+
361
+ torch.bmm = torch_bmm
362
+ torch.fft.fftn = fft_fftn
363
+ torch.fft.ifftn = fft_ifftn
364
+ if not device_supports_fp64:
365
+ torch.from_numpy = from_numpy
366
+ torch.as_tensor = as_tensor
367
+ return device_supports_fp64, can_allocate_plus_4gb
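A minimal sketch of enabling these hijacks; it assumes an Intel XPU build of PyTorch with IPEX, and the import path library.ipex.hijacks is only an assumption about where this file lives in the package.

import torch
from library.ipex.hijacks import ipex_hijacks  # assumed module path for this file

supports_fp64, can_alloc_4gb = ipex_hijacks(legacy=True)
# With the legacy hijacks active (torch < 2.5), "cuda" device arguments are
# transparently rewritten to "xpu", so CUDA-oriented code runs unchanged:
t = torch.ones(2, 2, device="cuda:0")  # actually allocated on xpu:0
print(t.device, supports_fp64, can_alloc_4gb)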
huggingface_util.py ADDED
@@ -0,0 +1,84 @@
1
+ from typing import Union, BinaryIO
2
+ from huggingface_hub import HfApi
3
+ from pathlib import Path
4
+ import argparse
5
+ import os
6
+ from library.utils import fire_in_thread
7
+ from library.utils import setup_logging
8
+ setup_logging()
9
+ import logging
10
+ logger = logging.getLogger(__name__)
11
+
12
+ def exists_repo(repo_id: str, repo_type: str, revision: str = "main", token: str = None):
13
+ api = HfApi(
14
+ token=token,
15
+ )
16
+ try:
17
+ api.repo_info(repo_id=repo_id, revision=revision, repo_type=repo_type)
18
+ return True
19
+ except:
20
+ return False
21
+
22
+
23
+ def upload(
24
+ args: argparse.Namespace,
25
+ src: Union[str, Path, bytes, BinaryIO],
26
+ dest_suffix: str = "",
27
+ force_sync_upload: bool = False,
28
+ ):
29
+ repo_id = args.huggingface_repo_id
30
+ repo_type = args.huggingface_repo_type
31
+ token = args.huggingface_token
32
+ path_in_repo = args.huggingface_path_in_repo + dest_suffix if args.huggingface_path_in_repo is not None else None
33
+ private = args.huggingface_repo_visibility is None or args.huggingface_repo_visibility != "public"
34
+ api = HfApi(token=token)
35
+ if not exists_repo(repo_id=repo_id, repo_type=repo_type, token=token):
36
+ try:
37
+ api.create_repo(repo_id=repo_id, repo_type=repo_type, private=private)
38
+ except Exception as e: # RepositoryNotFoundError has been confirmed, but catch broadly in case other errors occur
39
+ logger.error("===========================================")
40
+ logger.error(f"failed to create HuggingFace repo / HuggingFaceのリポジトリの作成に失敗しました : {e}")
41
+ logger.error("===========================================")
42
+
43
+ is_folder = (type(src) == str and os.path.isdir(src)) or (isinstance(src, Path) and src.is_dir())
44
+
45
+ def uploader():
46
+ try:
47
+ if is_folder:
48
+ api.upload_folder(
49
+ repo_id=repo_id,
50
+ repo_type=repo_type,
51
+ folder_path=src,
52
+ path_in_repo=path_in_repo,
53
+ )
54
+ else:
55
+ api.upload_file(
56
+ repo_id=repo_id,
57
+ repo_type=repo_type,
58
+ path_or_fileobj=src,
59
+ path_in_repo=path_in_repo,
60
+ )
61
+ except Exception as e: # RuntimeError has been confirmed, but catch broadly in case other errors occur
62
+ logger.error("===========================================")
63
+ logger.error(f"failed to upload to HuggingFace / HuggingFaceへのアップロードに失敗しました : {e}")
64
+ logger.error("===========================================")
65
+
66
+ if args.async_upload and not force_sync_upload:
67
+ fire_in_thread(uploader)
68
+ else:
69
+ uploader()
70
+
71
+
72
+ def list_dir(
73
+ repo_id: str,
74
+ subfolder: str,
75
+ repo_type: str,
76
+ revision: str = "main",
77
+ token: str = None,
78
+ ):
79
+ api = HfApi(
80
+ token=token,
81
+ )
82
+ repo_info = api.repo_info(repo_id=repo_id, revision=revision, repo_type=repo_type)
83
+ file_list = [file for file in repo_info.siblings if file.rfilename.startswith(subfolder)]
84
+ return file_list
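For reference, upload() only reads a handful of attributes from the argparse namespace, so it can also be driven programmatically. A hedged sketch follows; the repo id, token, and file paths are placeholders, and the import path is an assumption.

import argparse
from library import huggingface_util  # assumed import path for this module

args = argparse.Namespace(
    huggingface_repo_id="user/my-model",      # placeholder repo id
    huggingface_repo_type="model",
    huggingface_token=None,                   # or a real token string
    huggingface_path_in_repo="checkpoints",
    huggingface_repo_visibility=None,         # anything other than "public" -> private repo
    async_upload=False,
)
huggingface_util.upload(args, src="output/last.safetensors", dest_suffix="/last")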
hypernetwork.py ADDED
@@ -0,0 +1,223 @@
1
+ import torch
2
+ import torch.nn.functional as F
3
+ from diffusers.models.attention_processor import (
4
+ Attention,
5
+ AttnProcessor2_0,
6
+ SlicedAttnProcessor,
7
+ XFormersAttnProcessor
8
+ )
9
+
10
+ try:
11
+ import xformers.ops
12
+ except:
13
+ xformers = None
14
+
15
+
16
+ loaded_networks = []
17
+
18
+
19
+ def apply_single_hypernetwork(
20
+ hypernetwork, hidden_states, encoder_hidden_states
21
+ ):
22
+ context_k, context_v = hypernetwork.forward(hidden_states, encoder_hidden_states)
23
+ return context_k, context_v
24
+
25
+
26
+ def apply_hypernetworks(context_k, context_v, layer=None):
27
+ if len(loaded_networks) == 0:
28
+ return context_v, context_v
29
+ for hypernetwork in loaded_networks:
30
+ context_k, context_v = hypernetwork.forward(context_k, context_v)
31
+
32
+ context_k = context_k.to(dtype=context_k.dtype)
33
+ context_v = context_v.to(dtype=context_k.dtype)
34
+
35
+ return context_k, context_v
36
+
37
+
38
+
39
+ def xformers_forward(
40
+ self: XFormersAttnProcessor,
41
+ attn: Attention,
42
+ hidden_states: torch.Tensor,
43
+ encoder_hidden_states: torch.Tensor = None,
44
+ attention_mask: torch.Tensor = None,
45
+ ):
46
+ batch_size, sequence_length, _ = (
47
+ hidden_states.shape
48
+ if encoder_hidden_states is None
49
+ else encoder_hidden_states.shape
50
+ )
51
+
52
+ attention_mask = attn.prepare_attention_mask(
53
+ attention_mask, sequence_length, batch_size
54
+ )
55
+
56
+ query = attn.to_q(hidden_states)
57
+
58
+ if encoder_hidden_states is None:
59
+ encoder_hidden_states = hidden_states
60
+ elif attn.norm_cross:
61
+ encoder_hidden_states = attn.norm_encoder_hidden_states(encoder_hidden_states)
62
+
63
+ context_k, context_v = apply_hypernetworks(hidden_states, encoder_hidden_states)
64
+
65
+ key = attn.to_k(context_k)
66
+ value = attn.to_v(context_v)
67
+
68
+ query = attn.head_to_batch_dim(query).contiguous()
69
+ key = attn.head_to_batch_dim(key).contiguous()
70
+ value = attn.head_to_batch_dim(value).contiguous()
71
+
72
+ hidden_states = xformers.ops.memory_efficient_attention(
73
+ query,
74
+ key,
75
+ value,
76
+ attn_bias=attention_mask,
77
+ op=self.attention_op,
78
+ scale=attn.scale,
79
+ )
80
+ hidden_states = hidden_states.to(query.dtype)
81
+ hidden_states = attn.batch_to_head_dim(hidden_states)
82
+
83
+ # linear proj
84
+ hidden_states = attn.to_out[0](hidden_states)
85
+ # dropout
86
+ hidden_states = attn.to_out[1](hidden_states)
87
+ return hidden_states
88
+
89
+
90
+ def sliced_attn_forward(
91
+ self: SlicedAttnProcessor,
92
+ attn: Attention,
93
+ hidden_states: torch.Tensor,
94
+ encoder_hidden_states: torch.Tensor = None,
95
+ attention_mask: torch.Tensor = None,
96
+ ):
97
+ batch_size, sequence_length, _ = (
98
+ hidden_states.shape
99
+ if encoder_hidden_states is None
100
+ else encoder_hidden_states.shape
101
+ )
102
+ attention_mask = attn.prepare_attention_mask(
103
+ attention_mask, sequence_length, batch_size
104
+ )
105
+
106
+ query = attn.to_q(hidden_states)
107
+ dim = query.shape[-1]
108
+ query = attn.head_to_batch_dim(query)
109
+
110
+ if encoder_hidden_states is None:
111
+ encoder_hidden_states = hidden_states
112
+ elif attn.norm_cross:
113
+ encoder_hidden_states = attn.norm_encoder_hidden_states(encoder_hidden_states)
114
+
115
+ context_k, context_v = apply_hypernetworks(hidden_states, encoder_hidden_states)
116
+
117
+ key = attn.to_k(context_k)
118
+ value = attn.to_v(context_v)
119
+ key = attn.head_to_batch_dim(key)
120
+ value = attn.head_to_batch_dim(value)
121
+
122
+ batch_size_attention, query_tokens, _ = query.shape
123
+ hidden_states = torch.zeros(
124
+ (batch_size_attention, query_tokens, dim // attn.heads),
125
+ device=query.device,
126
+ dtype=query.dtype,
127
+ )
128
+
129
+ for i in range(batch_size_attention // self.slice_size):
130
+ start_idx = i * self.slice_size
131
+ end_idx = (i + 1) * self.slice_size
132
+
133
+ query_slice = query[start_idx:end_idx]
134
+ key_slice = key[start_idx:end_idx]
135
+ attn_mask_slice = (
136
+ attention_mask[start_idx:end_idx] if attention_mask is not None else None
137
+ )
138
+
139
+ attn_slice = attn.get_attention_scores(query_slice, key_slice, attn_mask_slice)
140
+
141
+ attn_slice = torch.bmm(attn_slice, value[start_idx:end_idx])
142
+
143
+ hidden_states[start_idx:end_idx] = attn_slice
144
+
145
+ hidden_states = attn.batch_to_head_dim(hidden_states)
146
+
147
+ # linear proj
148
+ hidden_states = attn.to_out[0](hidden_states)
149
+ # dropout
150
+ hidden_states = attn.to_out[1](hidden_states)
151
+
152
+ return hidden_states
153
+
154
+
155
+ def v2_0_forward(
156
+ self: AttnProcessor2_0,
157
+ attn: Attention,
158
+ hidden_states,
159
+ encoder_hidden_states=None,
160
+ attention_mask=None,
161
+ ):
162
+ batch_size, sequence_length, _ = (
163
+ hidden_states.shape
164
+ if encoder_hidden_states is None
165
+ else encoder_hidden_states.shape
166
+ )
167
+ inner_dim = hidden_states.shape[-1]
168
+
169
+ if attention_mask is not None:
170
+ attention_mask = attn.prepare_attention_mask(
171
+ attention_mask, sequence_length, batch_size
172
+ )
173
+ # scaled_dot_product_attention expects attention_mask shape to be
174
+ # (batch, heads, source_length, target_length)
175
+ attention_mask = attention_mask.view(
176
+ batch_size, attn.heads, -1, attention_mask.shape[-1]
177
+ )
178
+
179
+ query = attn.to_q(hidden_states)
180
+
181
+ if encoder_hidden_states is None:
182
+ encoder_hidden_states = hidden_states
183
+ elif attn.norm_cross:
184
+ encoder_hidden_states = attn.norm_encoder_hidden_states(encoder_hidden_states)
185
+
186
+ context_k, context_v = apply_hypernetworks(hidden_states, encoder_hidden_states)
187
+
188
+ key = attn.to_k(context_k)
189
+ value = attn.to_v(context_v)
190
+
191
+ head_dim = inner_dim // attn.heads
192
+ query = query.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
193
+ key = key.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
194
+ value = value.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
195
+
196
+ # the output of sdp = (batch, num_heads, seq_len, head_dim)
197
+ # TODO: add support for attn.scale when we move to Torch 2.1
198
+ hidden_states = F.scaled_dot_product_attention(
199
+ query, key, value, attn_mask=attention_mask, dropout_p=0.0, is_causal=False
200
+ )
201
+
202
+ hidden_states = hidden_states.transpose(1, 2).reshape(
203
+ batch_size, -1, attn.heads * head_dim
204
+ )
205
+ hidden_states = hidden_states.to(query.dtype)
206
+
207
+ # linear proj
208
+ hidden_states = attn.to_out[0](hidden_states)
209
+ # dropout
210
+ hidden_states = attn.to_out[1](hidden_states)
211
+ return hidden_states
212
+
213
+
214
+ def replace_attentions_for_hypernetwork():
215
+ import diffusers.models.attention_processor
216
+
217
+ diffusers.models.attention_processor.XFormersAttnProcessor.__call__ = (
218
+ xformers_forward
219
+ )
220
+ diffusers.models.attention_processor.SlicedAttnProcessor.__call__ = (
221
+ sliced_attn_forward
222
+ )
223
+ diffusers.models.attention_processor.AttnProcessor2_0.__call__ = v2_0_forward
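The patching above routes the attention key/value projections through every entry in loaded_networks; any object whose forward(context_k, context_v) returns two tensors can be registered. A minimal sketch with a no-op stand-in, for illustration only (the import path is an assumption):

import torch
from library import hypernetwork  # assumed import path for this module

class IdentityHypernetwork(torch.nn.Module):
    # stand-in: returns the contexts unchanged, just to show the expected interface
    def forward(self, context_k, context_v):
        return context_k, context_v

hypernetwork.loaded_networks.append(IdentityHypernetwork())
hypernetwork.replace_attentions_for_hypernetwork()
# From here on, the diffusers attention processors feed k/v through loaded_networks.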
hypernetwork_nai.py ADDED
@@ -0,0 +1,96 @@
1
+ # NAI compatible
2
+
3
+ import torch
4
+
5
+
6
+ class HypernetworkModule(torch.nn.Module):
7
+ def __init__(self, dim, multiplier=1.0):
8
+ super().__init__()
9
+
10
+ linear1 = torch.nn.Linear(dim, dim * 2)
11
+ linear2 = torch.nn.Linear(dim * 2, dim)
12
+ linear1.weight.data.normal_(mean=0.0, std=0.01)
13
+ linear1.bias.data.zero_()
14
+ linear2.weight.data.normal_(mean=0.0, std=0.01)
15
+ linear2.bias.data.zero_()
16
+ linears = [linear1, linear2]
17
+
18
+ self.linear = torch.nn.Sequential(*linears)
19
+ self.multiplier = multiplier
20
+
21
+ def forward(self, x):
22
+ return x + self.linear(x) * self.multiplier
23
+
24
+
25
+ class Hypernetwork(torch.nn.Module):
26
+ enable_sizes = [320, 640, 768, 1280]
27
+ # return self.modules[Hypernetwork.enable_sizes.index(size)]
28
+
29
+ def __init__(self, multiplier=1.0) -> None:
30
+ super().__init__()
31
+ self.modules = []
32
+ for size in Hypernetwork.enable_sizes:
33
+ self.modules.append((HypernetworkModule(size, multiplier), HypernetworkModule(size, multiplier)))
34
+ self.register_module(f"{size}_0", self.modules[-1][0])
35
+ self.register_module(f"{size}_1", self.modules[-1][1])
36
+
37
+ def apply_to_stable_diffusion(self, text_encoder, vae, unet):
38
+ blocks = unet.input_blocks + [unet.middle_block] + unet.output_blocks
39
+ for block in blocks:
40
+ for subblk in block:
41
+ if 'SpatialTransformer' in str(type(subblk)):
42
+ for tf_block in subblk.transformer_blocks:
43
+ for attn in [tf_block.attn1, tf_block.attn2]:
44
+ size = attn.context_dim
45
+ if size in Hypernetwork.enable_sizes:
46
+ attn.hypernetwork = self
47
+ else:
48
+ attn.hypernetwork = None
49
+
50
+ def apply_to_diffusers(self, text_encoder, vae, unet):
51
+ blocks = unet.down_blocks + [unet.mid_block] + unet.up_blocks
52
+ for block in blocks:
53
+ if hasattr(block, 'attentions'):
54
+ for subblk in block.attentions:
55
+ if 'SpatialTransformer' in str(type(subblk)) or 'Transformer2DModel' in str(type(subblk)): # 0.6.0 and 0.7~
56
+ for tf_block in subblk.transformer_blocks:
57
+ for attn in [tf_block.attn1, tf_block.attn2]:
58
+ size = attn.to_k.in_features
59
+ if size in Hypernetwork.enable_sizes:
60
+ attn.hypernetwork = self
61
+ else:
62
+ attn.hypernetwork = None
63
+ return True # TODO error checking
64
+
65
+ def forward(self, x, context):
66
+ size = context.shape[-1]
67
+ assert size in Hypernetwork.enable_sizes
68
+ module = self.modules[Hypernetwork.enable_sizes.index(size)]
69
+ return module[0].forward(context), module[1].forward(context)
70
+
71
+ def load_from_state_dict(self, state_dict):
72
+ # old ver to new ver
73
+ changes = {
74
+ 'linear1.bias': 'linear.0.bias',
75
+ 'linear1.weight': 'linear.0.weight',
76
+ 'linear2.bias': 'linear.1.bias',
77
+ 'linear2.weight': 'linear.1.weight',
78
+ }
79
+ for key_from, key_to in changes.items():
80
+ if key_from in state_dict:
81
+ state_dict[key_to] = state_dict[key_from]
82
+ del state_dict[key_from]
83
+
84
+ for size, sd in state_dict.items():
85
+ if type(size) == int:
86
+ self.modules[Hypernetwork.enable_sizes.index(size)][0].load_state_dict(sd[0], strict=True)
87
+ self.modules[Hypernetwork.enable_sizes.index(size)][1].load_state_dict(sd[1], strict=True)
88
+ return True
89
+
90
+ def get_state_dict(self):
91
+ state_dict = {}
92
+ for i, size in enumerate(Hypernetwork.enable_sizes):
93
+ sd0 = self.modules[i][0].state_dict()
94
+ sd1 = self.modules[i][1].state_dict()
95
+ state_dict[size] = [sd0, sd1]
96
+ return state_dict
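A short sketch of how this class might be used to load an NAI-format hypernetwork and attach it to a diffusers UNet; the file path, the unet variable, and the import path are placeholders/assumptions.

import torch
from library.hypernetwork_nai import Hypernetwork  # assumed import path

state_dict = torch.load("my_hypernetwork.pt", map_location="cpu")  # placeholder path
hyper = Hypernetwork(multiplier=1.0)
hyper.load_from_state_dict(state_dict)
# hyper.apply_to_diffusers(None, None, unet)  # with a diffusers UNet2DConditionModel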
latent_upscaler.py ADDED
@@ -0,0 +1,354 @@
1
+ # Script for easily calling the upscaler from external code
2
+ # The model definition is included so this file works standalone
3
+
4
+ import argparse
5
+ import glob
6
+ import os
7
+ import cv2
8
+ from diffusers import AutoencoderKL
9
+
10
+ from typing import Dict, List
11
+ import numpy as np
12
+
13
+ import torch
14
+ from library.device_utils import init_ipex, get_preferred_device
15
+ init_ipex()
16
+
17
+ from torch import nn
18
+ from tqdm import tqdm
19
+ from PIL import Image
20
+ from library.utils import setup_logging
21
+ setup_logging()
22
+ import logging
23
+ logger = logging.getLogger(__name__)
24
+
25
+ class ResidualBlock(nn.Module):
26
+ def __init__(self, in_channels, out_channels=None, kernel_size=3, stride=1, padding=1):
27
+ super(ResidualBlock, self).__init__()
28
+
29
+ if out_channels is None:
30
+ out_channels = in_channels
31
+
32
+ self.conv1 = nn.Conv2d(in_channels, out_channels, kernel_size, stride, padding, bias=False)
33
+ self.bn1 = nn.BatchNorm2d(out_channels)
34
+ self.relu1 = nn.ReLU(inplace=True)
35
+
36
+ self.conv2 = nn.Conv2d(out_channels, out_channels, kernel_size, stride, padding, bias=False)
37
+ self.bn2 = nn.BatchNorm2d(out_channels)
38
+
39
+ self.relu2 = nn.ReLU(inplace=True) # this ReLU might be better applied before adding the residual
40
+
41
+ # initialize weights
42
+ self._initialize_weights()
43
+
44
+ def _initialize_weights(self):
45
+ for m in self.modules():
46
+ if isinstance(m, nn.Conv2d):
47
+ nn.init.kaiming_normal_(m.weight, mode="fan_out", nonlinearity="relu")
48
+ if m.bias is not None:
49
+ nn.init.constant_(m.bias, 0)
50
+ elif isinstance(m, nn.BatchNorm2d):
51
+ nn.init.constant_(m.weight, 1)
52
+ nn.init.constant_(m.bias, 0)
53
+ elif isinstance(m, nn.Linear):
54
+ nn.init.normal_(m.weight, 0, 0.01)
55
+ nn.init.constant_(m.bias, 0)
56
+
57
+ def forward(self, x):
58
+ residual = x
59
+
60
+ out = self.conv1(x)
61
+ out = self.bn1(out)
62
+ out = self.relu1(out)
63
+
64
+ out = self.conv2(out)
65
+ out = self.bn2(out)
66
+
67
+ out += residual
68
+
69
+ out = self.relu2(out)
70
+
71
+ return out
72
+
73
+
74
+ class Upscaler(nn.Module):
75
+ def __init__(self):
76
+ super(Upscaler, self).__init__()
77
+
78
+ # define layers
79
+ # latent has 4 channels
80
+
81
+ self.conv1 = nn.Conv2d(4, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
82
+ self.bn1 = nn.BatchNorm2d(128)
83
+ self.relu1 = nn.ReLU(inplace=True)
84
+
85
+ # resblocks
86
+ # brute force with 20 blocks: stacking more blocks should widen the receptive field more than increasing the channel count
87
+ self.resblock1 = ResidualBlock(128)
88
+ self.resblock2 = ResidualBlock(128)
89
+ self.resblock3 = ResidualBlock(128)
90
+ self.resblock4 = ResidualBlock(128)
91
+ self.resblock5 = ResidualBlock(128)
92
+ self.resblock6 = ResidualBlock(128)
93
+ self.resblock7 = ResidualBlock(128)
94
+ self.resblock8 = ResidualBlock(128)
95
+ self.resblock9 = ResidualBlock(128)
96
+ self.resblock10 = ResidualBlock(128)
97
+ self.resblock11 = ResidualBlock(128)
98
+ self.resblock12 = ResidualBlock(128)
99
+ self.resblock13 = ResidualBlock(128)
100
+ self.resblock14 = ResidualBlock(128)
101
+ self.resblock15 = ResidualBlock(128)
102
+ self.resblock16 = ResidualBlock(128)
103
+ self.resblock17 = ResidualBlock(128)
104
+ self.resblock18 = ResidualBlock(128)
105
+ self.resblock19 = ResidualBlock(128)
106
+ self.resblock20 = ResidualBlock(128)
107
+
108
+ # last convs
109
+ self.conv2 = nn.Conv2d(128, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
110
+ self.bn2 = nn.BatchNorm2d(64)
111
+ self.relu2 = nn.ReLU(inplace=True)
112
+
113
+ self.conv3 = nn.Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
114
+ self.bn3 = nn.BatchNorm2d(64)
115
+ self.relu3 = nn.ReLU(inplace=True)
116
+
117
+ # final conv: output 4 channels
118
+ self.conv_final = nn.Conv2d(64, 4, kernel_size=(1, 1), stride=(1, 1), padding=(0, 0))
119
+
120
+ # initialize weights
121
+ self._initialize_weights()
122
+
123
+ def _initialize_weights(self):
124
+ for m in self.modules():
125
+ if isinstance(m, nn.Conv2d):
126
+ nn.init.kaiming_normal_(m.weight, mode="fan_out", nonlinearity="relu")
127
+ if m.bias is not None:
128
+ nn.init.constant_(m.bias, 0)
129
+ elif isinstance(m, nn.BatchNorm2d):
130
+ nn.init.constant_(m.weight, 1)
131
+ nn.init.constant_(m.bias, 0)
132
+ elif isinstance(m, nn.Linear):
133
+ nn.init.normal_(m.weight, 0, 0.01)
134
+ nn.init.constant_(m.bias, 0)
135
+
136
+ # initialize final conv weights to 0: 流行りのzero conv
137
+ nn.init.constant_(self.conv_final.weight, 0)
138
+
139
+ def forward(self, x):
140
+ inp = x
141
+
142
+ x = self.conv1(x)
143
+ x = self.bn1(x)
144
+ x = self.relu1(x)
145
+
146
+ # adding the residual after passing through several resblocks should improve both accuracy and training speed
147
+ residual = x
148
+ x = self.resblock1(x)
149
+ x = self.resblock2(x)
150
+ x = self.resblock3(x)
151
+ x = self.resblock4(x)
152
+ x = x + residual
153
+ residual = x
154
+ x = self.resblock5(x)
155
+ x = self.resblock6(x)
156
+ x = self.resblock7(x)
157
+ x = self.resblock8(x)
158
+ x = x + residual
159
+ residual = x
160
+ x = self.resblock9(x)
161
+ x = self.resblock10(x)
162
+ x = self.resblock11(x)
163
+ x = self.resblock12(x)
164
+ x = x + residual
165
+ residual = x
166
+ x = self.resblock13(x)
167
+ x = self.resblock14(x)
168
+ x = self.resblock15(x)
169
+ x = self.resblock16(x)
170
+ x = x + residual
171
+ residual = x
172
+ x = self.resblock17(x)
173
+ x = self.resblock18(x)
174
+ x = self.resblock19(x)
175
+ x = self.resblock20(x)
176
+ x = x + residual
177
+
178
+ x = self.conv2(x)
179
+ x = self.bn2(x)
180
+ x = self.relu2(x)
181
+ x = self.conv3(x)
182
+ x = self.bn3(x)
183
+
184
+ # it seems better not to apply a ReLU here
185
+
186
+ x = self.conv_final(x)
187
+
188
+ # network estimates the difference between the input and the output
189
+ x = x + inp
190
+
191
+ return x
192
+
193
+ def support_latents(self) -> bool:
194
+ return False
195
+
196
+ def upscale(
197
+ self,
198
+ vae: AutoencoderKL,
199
+ lowreso_images: List[Image.Image],
200
+ lowreso_latents: torch.Tensor,
201
+ dtype: torch.dtype,
202
+ width: int,
203
+ height: int,
204
+ batch_size: int = 1,
205
+ vae_batch_size: int = 1,
206
+ ):
207
+ # assertion
208
+ assert lowreso_images is not None, "Upscaler requires lowreso image"
209
+
210
+ # make upsampled image with lanczos4
211
+ upsampled_images = []
212
+ for lowreso_image in lowreso_images:
213
+ upsampled_image = np.array(lowreso_image.resize((width, height), Image.LANCZOS))
214
+ upsampled_images.append(upsampled_image)
215
+
216
+ # convert to tensor: this tensor is too large to be converted to cuda
217
+ upsampled_images = [torch.from_numpy(upsampled_image).permute(2, 0, 1).float() for upsampled_image in upsampled_images]
218
+ upsampled_images = torch.stack(upsampled_images, dim=0)
219
+ upsampled_images = upsampled_images.to(dtype)
220
+
221
+ # normalize to [-1, 1]
222
+ upsampled_images = upsampled_images / 127.5 - 1.0
223
+
224
+ # convert upsample images to latents with batch size
225
+ # logger.info("Encoding upsampled (LANCZOS4) images...")
226
+ upsampled_latents = []
227
+ for i in tqdm(range(0, upsampled_images.shape[0], vae_batch_size)):
228
+ batch = upsampled_images[i : i + vae_batch_size].to(vae.device)
229
+ with torch.no_grad():
230
+ batch = vae.encode(batch).latent_dist.sample()
231
+ upsampled_latents.append(batch)
232
+
233
+ upsampled_latents = torch.cat(upsampled_latents, dim=0)
234
+
235
+ # upscale (refine) latents with this model with batch size
236
+ logger.info("Upscaling latents...")
237
+ upscaled_latents = []
238
+ for i in range(0, upsampled_latents.shape[0], batch_size):
239
+ with torch.no_grad():
240
+ upscaled_latents.append(self.forward(upsampled_latents[i : i + batch_size]))
241
+ upscaled_latents = torch.cat(upscaled_latents, dim=0)
242
+
243
+ return upscaled_latents * 0.18215
244
+
245
+
246
+ # external interface: returns a model
247
+ def create_upscaler(**kwargs):
248
+ weights = kwargs["weights"]
249
+ model = Upscaler()
250
+
251
+ logger.info(f"Loading weights from {weights}...")
252
+ if os.path.splitext(weights)[1] == ".safetensors":
253
+ from safetensors.torch import load_file
254
+
255
+ sd = load_file(weights)
256
+ else:
257
+ sd = torch.load(weights, map_location=torch.device("cpu"))
258
+ model.load_state_dict(sd)
259
+ return model
260
+
261
+
262
+ # another interface: upscale images with a model for given images from command line
263
+ def upscale_images(args: argparse.Namespace):
264
+ DEVICE = get_preferred_device()
265
+ us_dtype = torch.float16 # TODO: support fp32/bf16
266
+ os.makedirs(args.output_dir, exist_ok=True)
267
+
268
+ # load VAE with Diffusers
269
+ assert args.vae_path is not None, "VAE path is required"
270
+ logger.info(f"Loading VAE from {args.vae_path}...")
271
+ vae = AutoencoderKL.from_pretrained(args.vae_path, subfolder="vae")
272
+ vae.to(DEVICE, dtype=us_dtype)
273
+
274
+ # prepare model
275
+ logger.info("Preparing model...")
276
+ upscaler: Upscaler = create_upscaler(weights=args.weights)
277
+ # logger.info("Loading weights from", args.weights)
278
+ # upscaler.load_state_dict(torch.load(args.weights))
279
+ upscaler.eval()
280
+ upscaler.to(DEVICE, dtype=us_dtype)
281
+
282
+ # load images
283
+ image_paths = glob.glob(args.image_pattern)
284
+ images = []
285
+ for image_path in image_paths:
286
+ image = Image.open(image_path)
287
+ image = image.convert("RGB")
288
+
289
+ # make divisible by 8
290
+ width = image.width
291
+ height = image.height
292
+ if width % 8 != 0:
293
+ width = width - (width % 8)
294
+ if height % 8 != 0:
295
+ height = height - (height % 8)
296
+ if width != image.width or height != image.height:
297
+ image = image.crop((0, 0, width, height))
298
+
299
+ images.append(image)
300
+
301
+ # debug output
302
+ if args.debug:
303
+ for image, image_path in zip(images, image_paths):
304
+ image_debug = image.resize((image.width * 2, image.height * 2), Image.LANCZOS)
305
+
306
+ basename = os.path.basename(image_path)
307
+ basename_wo_ext, ext = os.path.splitext(basename)
308
+ dest_file_name = os.path.join(args.output_dir, f"{basename_wo_ext}_lanczos4{ext}")
309
+ image_debug.save(dest_file_name)
310
+
311
+ # upscale
312
+ logger.info("Upscaling...")
313
+ upscaled_latents = upscaler.upscale(
314
+ vae, images, None, us_dtype, width * 2, height * 2, batch_size=args.batch_size, vae_batch_size=args.vae_batch_size
315
+ )
316
+ upscaled_latents /= 0.18215
317
+
318
+ # decode with batch
319
+ logger.info("Decoding...")
320
+ upscaled_images = []
321
+ for i in tqdm(range(0, upscaled_latents.shape[0], args.vae_batch_size)):
322
+ with torch.no_grad():
323
+ batch = vae.decode(upscaled_latents[i : i + args.vae_batch_size]).sample
324
+ batch = batch.to("cpu")
325
+ upscaled_images.append(batch)
326
+ upscaled_images = torch.cat(upscaled_images, dim=0)
327
+
328
+ # tensor to numpy
329
+ upscaled_images = upscaled_images.permute(0, 2, 3, 1).numpy()
330
+ upscaled_images = (upscaled_images + 1.0) * 127.5
331
+ upscaled_images = upscaled_images.clip(0, 255).astype(np.uint8)
332
+
333
+ upscaled_images = upscaled_images[..., ::-1]
334
+
335
+ # save images
336
+ for i, image in enumerate(upscaled_images):
337
+ basename = os.path.basename(image_paths[i])
338
+ basename_wo_ext, ext = os.path.splitext(basename)
339
+ dest_file_name = os.path.join(args.output_dir, f"{basename_wo_ext}_upscaled{ext}")
340
+ cv2.imwrite(dest_file_name, image)
341
+
342
+
343
+ if __name__ == "__main__":
344
+ parser = argparse.ArgumentParser()
345
+ parser.add_argument("--vae_path", type=str, default=None, help="VAE path")
346
+ parser.add_argument("--weights", type=str, default=None, help="Weights path")
347
+ parser.add_argument("--image_pattern", type=str, default=None, help="Image pattern")
348
+ parser.add_argument("--output_dir", type=str, default=".", help="Output directory")
349
+ parser.add_argument("--batch_size", type=int, default=4, help="Batch size")
350
+ parser.add_argument("--vae_batch_size", type=int, default=1, help="VAE batch size")
351
+ parser.add_argument("--debug", action="store_true", help="Debug mode")
352
+
353
+ args = parser.parse_args()
354
+ upscale_images(args)
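Since the script is argparse-driven, upscale_images() can also be called programmatically. A sketch with placeholder paths follows; note the VAE path must point at a diffusers model directory containing a "vae" subfolder, because from_pretrained(..., subfolder="vae") is used above, and the import path is an assumption.

import argparse
from latent_upscaler import upscale_images  # assumed import path for this script

args = argparse.Namespace(
    vae_path="path/to/diffusers-sd-model",    # placeholder; needs a "vae" subfolder
    weights="path/to/upscaler.safetensors",   # placeholder upscaler weights
    image_pattern="inputs/*.png",
    output_dir="outputs",
    batch_size=1,
    vae_batch_size=1,
    debug=False,
)
upscale_images(args)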
libbitsandbytes_cpu.dll ADDED
Binary file (76.3 kB).
 
libbitsandbytes_cuda116.dll ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:88f7bd2916ca3effc43f88492f1e1b9088d13cb5be3b4a3a4aede6aa3bf8d412
3
+ size 4724224
libbitsandbytes_cuda118.dll ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4dc34709b8dcb078cbcdd65e5684f116cb395644d12b9c9fb144af5455bb1c18
3
+ size 14026752
logo_aihub.png ADDED
lora.py ADDED
@@ -0,0 +1,1410 @@
1
+ # LoRA network module
2
+ # reference:
3
+ # https://github.com/microsoft/LoRA/blob/main/loralib/layers.py
4
+ # https://github.com/cloneofsimo/lora/blob/master/lora_diffusion/lora.py
5
+
6
+ import math
7
+ import os
8
+ from typing import Dict, List, Optional, Tuple, Type, Union
9
+ from diffusers import AutoencoderKL
10
+ from transformers import CLIPTextModel
11
+ import numpy as np
12
+ import torch
13
+ import re
14
+ from library.utils import setup_logging
15
+ from library.sdxl_original_unet import SdxlUNet2DConditionModel
16
+
17
+ setup_logging()
18
+ import logging
19
+
20
+ logger = logging.getLogger(__name__)
21
+
22
+ RE_UPDOWN = re.compile(r"(up|down)_blocks_(\d+)_(resnets|upsamplers|downsamplers|attentions)_(\d+)_")
23
+
24
+
25
+ class LoRAModule(torch.nn.Module):
26
+ """
27
+ replaces forward method of the original Linear, instead of replacing the original Linear module.
28
+ """
29
+
30
+ def __init__(
31
+ self,
32
+ lora_name,
33
+ org_module: torch.nn.Module,
34
+ multiplier=1.0,
35
+ lora_dim=4,
36
+ alpha=1,
37
+ dropout=None,
38
+ rank_dropout=None,
39
+ module_dropout=None,
40
+ ):
41
+ """if alpha == 0 or None, alpha is rank (no scaling)."""
42
+ super().__init__()
43
+ self.lora_name = lora_name
44
+
45
+ if org_module.__class__.__name__ == "Conv2d":
46
+ in_dim = org_module.in_channels
47
+ out_dim = org_module.out_channels
48
+ else:
49
+ in_dim = org_module.in_features
50
+ out_dim = org_module.out_features
51
+
52
+ # if limit_rank:
53
+ # self.lora_dim = min(lora_dim, in_dim, out_dim)
54
+ # if self.lora_dim != lora_dim:
55
+ # logger.info(f"{lora_name} dim (rank) is changed to: {self.lora_dim}")
56
+ # else:
57
+ self.lora_dim = lora_dim
58
+
59
+ if org_module.__class__.__name__ == "Conv2d":
60
+ kernel_size = org_module.kernel_size
61
+ stride = org_module.stride
62
+ padding = org_module.padding
63
+ self.lora_down = torch.nn.Conv2d(in_dim, self.lora_dim, kernel_size, stride, padding, bias=False)
64
+ self.lora_up = torch.nn.Conv2d(self.lora_dim, out_dim, (1, 1), (1, 1), bias=False)
65
+ else:
66
+ self.lora_down = torch.nn.Linear(in_dim, self.lora_dim, bias=False)
67
+ self.lora_up = torch.nn.Linear(self.lora_dim, out_dim, bias=False)
68
+
69
+ if type(alpha) == torch.Tensor:
70
+ alpha = alpha.detach().float().numpy() # without casting, bf16 causes error
71
+ alpha = self.lora_dim if alpha is None or alpha == 0 else alpha
72
+ self.scale = alpha / self.lora_dim
73
+ self.register_buffer("alpha", torch.tensor(alpha)) # 定数として扱える
74
+
75
+ # same as microsoft's
76
+ torch.nn.init.kaiming_uniform_(self.lora_down.weight, a=math.sqrt(5))
77
+ torch.nn.init.zeros_(self.lora_up.weight)
78
+
79
+ self.multiplier = multiplier
80
+ self.org_module = org_module # remove in applying
81
+ self.dropout = dropout
82
+ self.rank_dropout = rank_dropout
83
+ self.module_dropout = module_dropout
84
+
85
+ def apply_to(self):
86
+ self.org_forward = self.org_module.forward
87
+ self.org_module.forward = self.forward
88
+ del self.org_module
89
+
90
+ def forward(self, x):
91
+ org_forwarded = self.org_forward(x)
92
+
93
+ # module dropout
94
+ if self.module_dropout is not None and self.training:
95
+ if torch.rand(1) < self.module_dropout:
96
+ return org_forwarded
97
+
98
+ lx = self.lora_down(x)
99
+
100
+ # normal dropout
101
+ if self.dropout is not None and self.training:
102
+ lx = torch.nn.functional.dropout(lx, p=self.dropout)
103
+
104
+ # rank dropout
105
+ if self.rank_dropout is not None and self.training:
106
+ mask = torch.rand((lx.size(0), self.lora_dim), device=lx.device) > self.rank_dropout
107
+ if len(lx.size()) == 3:
108
+ mask = mask.unsqueeze(1) # for Text Encoder
109
+ elif len(lx.size()) == 4:
110
+ mask = mask.unsqueeze(-1).unsqueeze(-1) # for Conv2d
111
+ lx = lx * mask
112
+
113
+ # scaling for rank dropout: treat as if the rank is changed
114
+ # this could be computed from the mask, but rank_dropout is used here expecting an augmentation-like effect
115
+ scale = self.scale * (1.0 / (1.0 - self.rank_dropout)) # redundant for readability
116
+ else:
117
+ scale = self.scale
118
+
119
+ lx = self.lora_up(lx)
120
+
121
+ return org_forwarded + lx * self.multiplier * scale
122
+
123
+
124
+ class LoRAInfModule(LoRAModule):
125
+ def __init__(
126
+ self,
127
+ lora_name,
128
+ org_module: torch.nn.Module,
129
+ multiplier=1.0,
130
+ lora_dim=4,
131
+ alpha=1,
132
+ **kwargs,
133
+ ):
134
+ # no dropout for inference
135
+ super().__init__(lora_name, org_module, multiplier, lora_dim, alpha)
136
+
137
+ self.org_module_ref = [org_module] # keep a reference so it can be accessed later
138
+ self.enabled = True
139
+
140
+ # check regional or not by lora_name
141
+ self.text_encoder = False
142
+ if lora_name.startswith("lora_te_"):
143
+ self.regional = False
144
+ self.use_sub_prompt = True
145
+ self.text_encoder = True
146
+ elif "attn2_to_k" in lora_name or "attn2_to_v" in lora_name:
147
+ self.regional = False
148
+ self.use_sub_prompt = True
149
+ elif "time_emb" in lora_name:
150
+ self.regional = False
151
+ self.use_sub_prompt = False
152
+ else:
153
+ self.regional = True
154
+ self.use_sub_prompt = False
155
+
156
+ self.network: LoRANetwork = None
157
+
158
+ def set_network(self, network):
159
+ self.network = network
160
+
161
+ # merge the LoRA weights into the frozen original module
162
+ def merge_to(self, sd, dtype, device):
163
+ # get up/down weight
164
+ up_weight = sd["lora_up.weight"].to(torch.float).to(device)
165
+ down_weight = sd["lora_down.weight"].to(torch.float).to(device)
166
+
167
+ # extract weight from org_module
168
+ org_sd = self.org_module.state_dict()
169
+ weight = org_sd["weight"].to(torch.float)
170
+
171
+ # merge weight
172
+ if len(weight.size()) == 2:
173
+ # linear
174
+ weight = weight + self.multiplier * (up_weight @ down_weight) * self.scale
175
+ elif down_weight.size()[2:4] == (1, 1):
176
+ # conv2d 1x1
177
+ weight = (
178
+ weight
179
+ + self.multiplier
180
+ * (up_weight.squeeze(3).squeeze(2) @ down_weight.squeeze(3).squeeze(2)).unsqueeze(2).unsqueeze(3)
181
+ * self.scale
182
+ )
183
+ else:
184
+ # conv2d 3x3
185
+ conved = torch.nn.functional.conv2d(down_weight.permute(1, 0, 2, 3), up_weight).permute(1, 0, 2, 3)
186
+ # logger.info(conved.size(), weight.size(), module.stride, module.padding)
187
+ weight = weight + self.multiplier * conved * self.scale
188
+
189
+ # set weight to org_module
190
+ org_sd["weight"] = weight.to(dtype)
191
+ self.org_module.load_state_dict(org_sd)
192
+
193
+ # return this module's weight so the merge can be restored later
194
+ def get_weight(self, multiplier=None):
195
+ if multiplier is None:
196
+ multiplier = self.multiplier
197
+
198
+ # get up/down weight from module
199
+ up_weight = self.lora_up.weight.to(torch.float)
200
+ down_weight = self.lora_down.weight.to(torch.float)
201
+
202
+ # pre-calculated weight
203
+ if len(down_weight.size()) == 2:
204
+ # linear
205
+ weight = self.multiplier * (up_weight @ down_weight) * self.scale
206
+ elif down_weight.size()[2:4] == (1, 1):
207
+ # conv2d 1x1
208
+ weight = (
209
+ self.multiplier
210
+ * (up_weight.squeeze(3).squeeze(2) @ down_weight.squeeze(3).squeeze(2)).unsqueeze(2).unsqueeze(3)
211
+ * self.scale
212
+ )
213
+ else:
214
+ # conv2d 3x3
215
+ conved = torch.nn.functional.conv2d(down_weight.permute(1, 0, 2, 3), up_weight).permute(1, 0, 2, 3)
216
+ weight = self.multiplier * conved * self.scale
217
+
218
+ return weight
219
+
220
+ def set_region(self, region):
221
+ self.region = region
222
+ self.region_mask = None
223
+
224
+ def default_forward(self, x):
225
+ # logger.info(f"default_forward {self.lora_name} {x.size()}")
226
+ return self.org_forward(x) + self.lora_up(self.lora_down(x)) * self.multiplier * self.scale
227
+
228
+ def forward(self, x):
229
+ if not self.enabled:
230
+ return self.org_forward(x)
231
+
232
+ if self.network is None or self.network.sub_prompt_index is None:
233
+ return self.default_forward(x)
234
+ if not self.regional and not self.use_sub_prompt:
235
+ return self.default_forward(x)
236
+
237
+ if self.regional:
238
+ return self.regional_forward(x)
239
+ else:
240
+ return self.sub_prompt_forward(x)
241
+
242
+ def get_mask_for_x(self, x):
243
+ # calculate size from shape of x
244
+ if len(x.size()) == 4:
245
+ h, w = x.size()[2:4]
246
+ area = h * w
247
+ else:
248
+ area = x.size()[1]
249
+
250
+ mask = self.network.mask_dic.get(area, None)
251
+ if mask is None or len(x.size()) == 2:
252
+ # emb_layers in SDXL doesn't have mask
253
+ # if "emb" not in self.lora_name:
254
+ # print(f"mask is None for resolution {self.lora_name}, {area}, {x.size()}")
255
+ mask_size = (1, x.size()[1]) if len(x.size()) == 2 else (1, *x.size()[1:-1], 1)
256
+ return torch.ones(mask_size, dtype=x.dtype, device=x.device) / self.network.num_sub_prompts
257
+ if len(x.size()) == 3:
258
+ mask = torch.reshape(mask, (1, -1, 1))
259
+ return mask
260
+
261
+ def regional_forward(self, x):
262
+ if "attn2_to_out" in self.lora_name:
263
+ return self.to_out_forward(x)
264
+
265
+ if self.network.mask_dic is None: # sub_prompt_index >= 3
266
+ return self.default_forward(x)
267
+
268
+ # apply mask for LoRA result
269
+ lx = self.lora_up(self.lora_down(x)) * self.multiplier * self.scale
270
+ mask = self.get_mask_for_x(lx)
271
+ # print("regional", self.lora_name, self.network.sub_prompt_index, lx.size(), mask.size())
272
+ # if mask.ndim > lx.ndim: # in some resolution, lx is 2d and mask is 3d (the reason is not checked)
273
+ # mask = mask.squeeze(-1)
274
+ lx = lx * mask
275
+
276
+ x = self.org_forward(x)
277
+ x = x + lx
278
+
279
+ if "attn2_to_q" in self.lora_name and self.network.is_last_network:
280
+ x = self.postp_to_q(x)
281
+
282
+ return x
283
+
284
+ def postp_to_q(self, x):
285
+ # repeat x to num_sub_prompts
286
+ has_real_uncond = x.size()[0] // self.network.batch_size == 3
287
+ qc = self.network.batch_size # uncond
288
+ qc += self.network.batch_size * self.network.num_sub_prompts # cond
289
+ if has_real_uncond:
290
+ qc += self.network.batch_size # real_uncond
291
+
292
+ query = torch.zeros((qc, x.size()[1], x.size()[2]), device=x.device, dtype=x.dtype)
293
+ query[: self.network.batch_size] = x[: self.network.batch_size]
294
+
295
+ for i in range(self.network.batch_size):
296
+ qi = self.network.batch_size + i * self.network.num_sub_prompts
297
+ query[qi : qi + self.network.num_sub_prompts] = x[self.network.batch_size + i]
298
+
299
+ if has_real_uncond:
300
+ query[-self.network.batch_size :] = x[-self.network.batch_size :]
301
+
302
+ # logger.info(f"postp_to_q {self.lora_name} {x.size()} {query.size()} {self.network.num_sub_prompts}")
303
+ return query
304
+
305
+ def sub_prompt_forward(self, x):
306
+ if x.size()[0] == self.network.batch_size: # if uncond in text_encoder, do not apply LoRA
307
+ return self.org_forward(x)
308
+
309
+ emb_idx = self.network.sub_prompt_index
310
+ if not self.text_encoder:
311
+ emb_idx += self.network.batch_size
312
+
313
+ # apply sub prompt of X
314
+ lx = x[emb_idx :: self.network.num_sub_prompts]
315
+ lx = self.lora_up(self.lora_down(lx)) * self.multiplier * self.scale
316
+
317
+ # logger.info(f"sub_prompt_forward {self.lora_name} {x.size()} {lx.size()} {emb_idx}")
318
+
319
+ x = self.org_forward(x)
320
+ x[emb_idx :: self.network.num_sub_prompts] += lx
321
+
322
+ return x
323
+
324
+ def to_out_forward(self, x):
325
+ # logger.info(f"to_out_forward {self.lora_name} {x.size()} {self.network.is_last_network}")
326
+
327
+ if self.network.is_last_network:
328
+ masks = [None] * self.network.num_sub_prompts
329
+ self.network.shared[self.lora_name] = (None, masks)
330
+ else:
331
+ lx, masks = self.network.shared[self.lora_name]
332
+
333
+ # call own LoRA
334
+ x1 = x[self.network.batch_size + self.network.sub_prompt_index :: self.network.num_sub_prompts]
335
+ lx1 = self.lora_up(self.lora_down(x1)) * self.multiplier * self.scale
336
+
337
+ if self.network.is_last_network:
338
+ lx = torch.zeros(
339
+ (self.network.num_sub_prompts * self.network.batch_size, *lx1.size()[1:]), device=lx1.device, dtype=lx1.dtype
340
+ )
341
+ self.network.shared[self.lora_name] = (lx, masks)
342
+
343
+ # logger.info(f"to_out_forward {lx.size()} {lx1.size()} {self.network.sub_prompt_index} {self.network.num_sub_prompts}")
344
+ lx[self.network.sub_prompt_index :: self.network.num_sub_prompts] += lx1
345
+ masks[self.network.sub_prompt_index] = self.get_mask_for_x(lx1)
346
+
347
+ # if not last network, return x and masks
348
+ x = self.org_forward(x)
349
+ if not self.network.is_last_network:
350
+ return x
351
+
352
+ lx, masks = self.network.shared.pop(self.lora_name)
353
+
354
+ # if last network, combine separated x with mask weighted sum
355
+ has_real_uncond = x.size()[0] // self.network.batch_size == self.network.num_sub_prompts + 2
356
+
357
+ out = torch.zeros((self.network.batch_size * (3 if has_real_uncond else 2), *x.size()[1:]), device=x.device, dtype=x.dtype)
358
+ out[: self.network.batch_size] = x[: self.network.batch_size] # uncond
359
+ if has_real_uncond:
360
+ out[-self.network.batch_size :] = x[-self.network.batch_size :] # real_uncond
361
+
362
+ # logger.info(f"to_out_forward {self.lora_name} {self.network.sub_prompt_index} {self.network.num_sub_prompts}")
363
+ # if num_sub_prompts > num of LoRAs, fill with zero
364
+ for i in range(len(masks)):
365
+ if masks[i] is None:
366
+ masks[i] = torch.zeros_like(masks[0])
367
+
368
+ mask = torch.cat(masks)
369
+ mask_sum = torch.sum(mask, dim=0) + 1e-4
370
+ for i in range(self.network.batch_size):
371
+ # 1枚の画像ごとに処理する / process one image of the batch at a time
372
+ lx1 = lx[i * self.network.num_sub_prompts : (i + 1) * self.network.num_sub_prompts]
373
+ lx1 = lx1 * mask
374
+ lx1 = torch.sum(lx1, dim=0)
375
+
376
+ xi = self.network.batch_size + i * self.network.num_sub_prompts
377
+ x1 = x[xi : xi + self.network.num_sub_prompts]
378
+ x1 = x1 * mask
379
+ x1 = torch.sum(x1, dim=0)
380
+ x1 = x1 / mask_sum
381
+
382
+ x1 = x1 + lx1
383
+ out[self.network.batch_size + i] = x1
384
+
385
+ # logger.info(f"to_out_forward {x.size()} {out.size()} {has_real_uncond}")
386
+ return out
387
+
388
+
389
+ def parse_block_lr_kwargs(is_sdxl: bool, nw_kwargs: Dict) -> Optional[List[float]]:
390
+ down_lr_weight = nw_kwargs.get("down_lr_weight", None)
391
+ mid_lr_weight = nw_kwargs.get("mid_lr_weight", None)
392
+ up_lr_weight = nw_kwargs.get("up_lr_weight", None)
393
+
394
+ # 以上のいずれにも設定がない場合は無効としてNoneを返す / if none of these is set, return None (disabled)
395
+ if down_lr_weight is None and mid_lr_weight is None and up_lr_weight is None:
396
+ return None
397
+
398
+ # extract learning rate weight for each block
399
+ if down_lr_weight is not None:
400
+ # if some parameters are not set, use zero
401
+ if "," in down_lr_weight:
402
+ down_lr_weight = [(float(s) if s else 0.0) for s in down_lr_weight.split(",")]
403
+
404
+ if mid_lr_weight is not None:
405
+ mid_lr_weight = [(float(s) if s else 0.0) for s in mid_lr_weight.split(",")]
406
+
407
+ if up_lr_weight is not None:
408
+ if "," in up_lr_weight:
409
+ up_lr_weight = [(float(s) if s else 0.0) for s in up_lr_weight.split(",")]
410
+
411
+ return get_block_lr_weight(
412
+ is_sdxl, down_lr_weight, mid_lr_weight, up_lr_weight, float(nw_kwargs.get("block_lr_zero_threshold", 0.0))
413
+ )
414
+
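For orientation, a sketch of the kwargs shape this parser expects; the values below are illustrative and mirror how --network_args key=value pairs arrive as strings (SD1.x assumed, so down/up take 12 entries):

# hypothetical input for parse_block_lr_kwargs
nw_kwargs = {
    "down_lr_weight": "0,0,0,0,0,0,1,1,1,1,1,1",  # per-block multipliers, shallow -> deep
    "mid_lr_weight": "1",
    "up_lr_weight": "sine+.5",                    # preset curve name plus a base offset
    "block_lr_zero_threshold": "0.1",             # weights not above this are zeroed
}
block_lr_weight = parse_block_lr_kwargs(is_sdxl=False, nw_kwargs=nw_kwargs)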
415
+
416
+ def create_network(
417
+ multiplier: float,
418
+ network_dim: Optional[int],
419
+ network_alpha: Optional[float],
420
+ vae: AutoencoderKL,
421
+ text_encoder: Union[CLIPTextModel, List[CLIPTextModel]],
422
+ unet,
423
+ neuron_dropout: Optional[float] = None,
424
+ **kwargs,
425
+ ):
426
+ # if unet is an instance of SdxlUNet2DConditionModel or subclass, set is_sdxl to True
427
+ is_sdxl = unet is not None and issubclass(unet.__class__, SdxlUNet2DConditionModel)
428
+
429
+ if network_dim is None:
430
+ network_dim = 4 # default
431
+ if network_alpha is None:
432
+ network_alpha = 1.0
433
+
434
+ # extract dim/alpha for conv2d, and block dim
435
+ conv_dim = kwargs.get("conv_dim", None)
436
+ conv_alpha = kwargs.get("conv_alpha", None)
437
+ if conv_dim is not None:
438
+ conv_dim = int(conv_dim)
439
+ if conv_alpha is None:
440
+ conv_alpha = 1.0
441
+ else:
442
+ conv_alpha = float(conv_alpha)
443
+
444
+ # block dim/alpha/lr
445
+ block_dims = kwargs.get("block_dims", None)
446
+ block_lr_weight = parse_block_lr_kwargs(is_sdxl, kwargs)
447
+
448
+ # 以上のいずれかに指定があればblockごとのdim(rank)を有効にする / if any of these is specified, enable per-block dim (rank)
449
+ if block_dims is not None or block_lr_weight is not None:
450
+ block_alphas = kwargs.get("block_alphas", None)
451
+ conv_block_dims = kwargs.get("conv_block_dims", None)
452
+ conv_block_alphas = kwargs.get("conv_block_alphas", None)
453
+
454
+ block_dims, block_alphas, conv_block_dims, conv_block_alphas = get_block_dims_and_alphas(
455
+ is_sdxl, block_dims, block_alphas, network_dim, network_alpha, conv_block_dims, conv_block_alphas, conv_dim, conv_alpha
456
+ )
457
+
458
+ # remove block dim/alpha without learning rate
459
+ block_dims, block_alphas, conv_block_dims, conv_block_alphas = remove_block_dims_and_alphas(
460
+ is_sdxl, block_dims, block_alphas, conv_block_dims, conv_block_alphas, block_lr_weight
461
+ )
462
+
463
+ else:
464
+ block_alphas = None
465
+ conv_block_dims = None
466
+ conv_block_alphas = None
467
+
468
+ # rank/module dropout
469
+ rank_dropout = kwargs.get("rank_dropout", None)
470
+ if rank_dropout is not None:
471
+ rank_dropout = float(rank_dropout)
472
+ module_dropout = kwargs.get("module_dropout", None)
473
+ if module_dropout is not None:
474
+ module_dropout = float(module_dropout)
475
+
476
+ # すごく引数が多いな ( ^ω^)・・・
477
+ network = LoRANetwork(
478
+ text_encoder,
479
+ unet,
480
+ multiplier=multiplier,
481
+ lora_dim=network_dim,
482
+ alpha=network_alpha,
483
+ dropout=neuron_dropout,
484
+ rank_dropout=rank_dropout,
485
+ module_dropout=module_dropout,
486
+ conv_lora_dim=conv_dim,
487
+ conv_alpha=conv_alpha,
488
+ block_dims=block_dims,
489
+ block_alphas=block_alphas,
490
+ conv_block_dims=conv_block_dims,
491
+ conv_block_alphas=conv_block_alphas,
492
+ varbose=True,
493
+ is_sdxl=is_sdxl,
494
+ )
495
+
496
+ loraplus_lr_ratio = kwargs.get("loraplus_lr_ratio", None)
497
+ loraplus_unet_lr_ratio = kwargs.get("loraplus_unet_lr_ratio", None)
498
+ loraplus_text_encoder_lr_ratio = kwargs.get("loraplus_text_encoder_lr_ratio", None)
499
+ loraplus_lr_ratio = float(loraplus_lr_ratio) if loraplus_lr_ratio is not None else None
500
+ loraplus_unet_lr_ratio = float(loraplus_unet_lr_ratio) if loraplus_unet_lr_ratio is not None else None
501
+ loraplus_text_encoder_lr_ratio = float(loraplus_text_encoder_lr_ratio) if loraplus_text_encoder_lr_ratio is not None else None
502
+ if loraplus_lr_ratio is not None or loraplus_unet_lr_ratio is not None or loraplus_text_encoder_lr_ratio is not None:
503
+ network.set_loraplus_lr_ratio(loraplus_lr_ratio, loraplus_unet_lr_ratio, loraplus_text_encoder_lr_ratio)
504
+
505
+ if block_lr_weight is not None:
506
+ network.set_block_lr_weight(block_lr_weight)
507
+
508
+ return network
509
+
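For reference, a sketch of how a training script might call this factory; the model objects are assumed to be loaded elsewhere, and keyword values typically arrive as strings when passed via --network_args:

# hypothetical wiring; vae / text_encoder / unet are assumed to exist
network = create_network(
    multiplier=1.0,
    network_dim=16,
    network_alpha=8.0,
    vae=vae,
    text_encoder=text_encoder,
    unet=unet,
    neuron_dropout=0.1,
    conv_dim="8",    # cast to int inside
    conv_alpha="4",  # cast to float inside
)
network.apply_to(text_encoder, unet, apply_text_encoder=True, apply_unet=True)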
510
+
511
+ # このメソッドは外部から呼び出される可能性を考慮しておく / keep in mind this method may be called from outside
512
+ # network_dim, network_alpha にはデフォルト値が入っている。 / network_dim and network_alpha already hold default values
513
+ # block_dims, block_alphas は両方ともNoneまたは両方とも値が入っている / block_dims and block_alphas are either both None or both set
514
+ # conv_dim, conv_alpha は両方ともNoneまたは両方とも値が入っている / conv_dim and conv_alpha are either both None or both set
515
+ def get_block_dims_and_alphas(
516
+ is_sdxl, block_dims, block_alphas, network_dim, network_alpha, conv_block_dims, conv_block_alphas, conv_dim, conv_alpha
517
+ ):
518
+ if not is_sdxl:
519
+ num_total_blocks = LoRANetwork.NUM_OF_BLOCKS * 2 + LoRANetwork.NUM_OF_MID_BLOCKS
520
+ else:
521
+ # 1+9+3+9+1=23, no LoRA for emb_layers (0)
522
+ num_total_blocks = 1 + LoRANetwork.SDXL_NUM_OF_BLOCKS * 2 + LoRANetwork.SDXL_NUM_OF_MID_BLOCKS + 1
523
+
524
+ def parse_ints(s):
525
+ return [int(i) for i in s.split(",")]
526
+
527
+ def parse_floats(s):
528
+ return [float(i) for i in s.split(",")]
529
+
530
+ # block_dimsとblock_alphasをパースする。必ず値が入る / parse block_dims and block_alphas; they always end up with values
531
+ if block_dims is not None:
532
+ block_dims = parse_ints(block_dims)
533
+ assert len(block_dims) == num_total_blocks, (
534
+ f"block_dims must have {num_total_blocks} elements but {len(block_dims)} elements are given"
535
+ + f" / block_dimsは{num_total_blocks}個指定してください(指定された個数: {len(block_dims)})"
536
+ )
537
+ else:
538
+ logger.warning(
539
+ f"block_dims is not specified. all dims are set to {network_dim} / block_dimsが指定されていません。すべてのdimは{network_dim}になります"
540
+ )
541
+ block_dims = [network_dim] * num_total_blocks
542
+
543
+ if block_alphas is not None:
544
+ block_alphas = parse_floats(block_alphas)
545
+ assert (
546
+ len(block_alphas) == num_total_blocks
547
+ ), f"block_alphas must have {num_total_blocks} elements / block_alphasは{num_total_blocks}個指定してください"
548
+ else:
549
+ logger.warning(
550
+ f"block_alphas is not specified. all alphas are set to {network_alpha} / block_alphasが指定されていません。すべてのalphaは{network_alpha}になります"
551
+ )
552
+ block_alphas = [network_alpha] * num_total_blocks
553
+
554
+ # conv_block_dimsとconv_block_alphasを、指定がある場合のみパースする。指定がなければconv_dimとconv_alphaを使う / parse conv_block_dims and conv_block_alphas only when given; otherwise fall back to conv_dim and conv_alpha
555
+ if conv_block_dims is not None:
556
+ conv_block_dims = parse_ints(conv_block_dims)
557
+ assert (
558
+ len(conv_block_dims) == num_total_blocks
559
+ ), f"conv_block_dims must have {num_total_blocks} elements / conv_block_dimsは{num_total_blocks}個指定してください"
560
+
561
+ if conv_block_alphas is not None:
562
+ conv_block_alphas = parse_floats(conv_block_alphas)
563
+ assert (
564
+ len(conv_block_alphas) == num_total_blocks
565
+ ), f"conv_block_alphas must have {num_total_blocks} elements / conv_block_alphasは{num_total_blocks}個指定してください"
566
+ else:
567
+ if conv_alpha is None:
568
+ conv_alpha = 1.0
569
+ logger.warning(
570
+ f"conv_block_alphas is not specified. all alphas are set to {conv_alpha} / conv_block_alphasが指定されていません。すべてのalphaは{conv_alpha}になります"
571
+ )
572
+ conv_block_alphas = [conv_alpha] * num_total_blocks
573
+ else:
574
+ if conv_dim is not None:
575
+ logger.warning(
576
+ f"conv_dim/alpha for all blocks are set to {conv_dim} and {conv_alpha} / すべてのブロックのconv_dimとalphaは{conv_dim}および{conv_alpha}になります"
577
+ )
578
+ conv_block_dims = [conv_dim] * num_total_blocks
579
+ conv_block_alphas = [conv_alpha] * num_total_blocks
580
+ else:
581
+ conv_block_dims = None
582
+ conv_block_alphas = None
583
+
584
+ return block_dims, block_alphas, conv_block_dims, conv_block_alphas
585
+
586
+
587
+ # 層別学習率用に層ごとの学習率に対する倍率を定義する、外部から呼び出せるようにclass外に出しておく / define per-block multipliers for layer-wise learning rates; kept outside the class so it can be called externally
588
+ # 戻り値は block ごとの倍率のリスト / the return value is a list of per-block multipliers
589
+ def get_block_lr_weight(
590
+ is_sdxl,
591
+ down_lr_weight: Union[str, List[float]],
592
+ mid_lr_weight: List[float],
593
+ up_lr_weight: Union[str, List[float]],
594
+ zero_threshold: float,
595
+ ) -> Optional[List[float]]:
596
+ # パラメータ未指定時は何もせず、今までと同じ動作とする / if no parameter is given, do nothing and keep the previous behavior
597
+ if up_lr_weight is None and mid_lr_weight is None and down_lr_weight is None:
598
+ return None
599
+
600
+ if not is_sdxl:
601
+ max_len_for_down_or_up = LoRANetwork.NUM_OF_BLOCKS
602
+ max_len_for_mid = LoRANetwork.NUM_OF_MID_BLOCKS
603
+ else:
604
+ max_len_for_down_or_up = LoRANetwork.SDXL_NUM_OF_BLOCKS
605
+ max_len_for_mid = LoRANetwork.SDXL_NUM_OF_MID_BLOCKS
606
+
607
+ def get_list(name_with_suffix) -> List[float]:
608
+ import math
609
+
610
+ tokens = name_with_suffix.split("+")
611
+ name = tokens[0]
612
+ base_lr = float(tokens[1]) if len(tokens) > 1 else 0.0
613
+
614
+ if name == "cosine":
615
+ return [
616
+ math.sin(math.pi * (i / (max_len_for_down_or_up - 1)) / 2) + base_lr
617
+ for i in reversed(range(max_len_for_down_or_up))
618
+ ]
619
+ elif name == "sine":
620
+ return [math.sin(math.pi * (i / (max_len_for_down_or_up - 1)) / 2) + base_lr for i in range(max_len_for_down_or_up)]
621
+ elif name == "linear":
622
+ return [i / (max_len_for_down_or_up - 1) + base_lr for i in range(max_len_for_down_or_up)]
623
+ elif name == "reverse_linear":
624
+ return [i / (max_len_for_down_or_up - 1) + base_lr for i in reversed(range(max_len_for_down_or_up))]
625
+ elif name == "zeros":
626
+ return [0.0 + base_lr] * max_len_for_down_or_up
627
+ else:
628
+ logger.error(
629
+ "Unknown lr_weight argument %s is used. Valid arguments: / 不明なlr_weightの引数 %s が使われました。有効な引数:\n\tcosine, sine, linear, reverse_linear, zeros"
630
+ % (name)
631
+ )
632
+ return None
633
+
634
+ if type(down_lr_weight) == str:
635
+ down_lr_weight = get_list(down_lr_weight)
636
+ if type(up_lr_weight) == str:
637
+ up_lr_weight = get_list(up_lr_weight)
638
+
639
+ if (up_lr_weight != None and len(up_lr_weight) > max_len_for_down_or_up) or (
640
+ down_lr_weight != None and len(down_lr_weight) > max_len_for_down_or_up
641
+ ):
642
+ logger.warning("down_weight or up_weight is too long. Parameters after %d-th are ignored." % max_len_for_down_or_up)
643
+ logger.warning("down_weightもしくはup_weightが長すぎます。%d個目以降のパラメータは無視されます。" % max_len_for_down_or_up)
644
+ up_lr_weight = up_lr_weight[:max_len_for_down_or_up]
645
+ down_lr_weight = down_lr_weight[:max_len_for_down_or_up]
646
+
647
+ if mid_lr_weight != None and len(mid_lr_weight) > max_len_for_mid:
648
+ logger.warning("mid_weight is too long. Parameters after %d-th are ignored." % max_len_for_mid)
649
+ logger.warning("mid_weightが長すぎます。%d個目以降のパラメータは無視されます。" % max_len_for_mid)
650
+ mid_lr_weight = mid_lr_weight[:max_len_for_mid]
651
+
652
+ if (up_lr_weight != None and len(up_lr_weight) < max_len_for_down_or_up) or (
653
+ down_lr_weight != None and len(down_lr_weight) < max_len_for_down_or_up
654
+ ):
655
+ logger.warning("down_weight or up_weight is too short. Parameters after %d-th are filled with 1." % max_len_for_down_or_up)
656
+ logger.warning(
657
+ "down_weightもしくはup_weightが短すぎます。%d個目までの不足したパラメータは1で補われます。" % max_len_for_down_or_up
658
+ )
659
+
660
+ if down_lr_weight != None and len(down_lr_weight) < max_len_for_down_or_up:
661
+ down_lr_weight = down_lr_weight + [1.0] * (max_len_for_down_or_up - len(down_lr_weight))
662
+ if up_lr_weight != None and len(up_lr_weight) < max_len_for_down_or_up:
663
+ up_lr_weight = up_lr_weight + [1.0] * (max_len_for_down_or_up - len(up_lr_weight))
664
+
665
+ if mid_lr_weight != None and len(mid_lr_weight) < max_len_for_mid:
666
+ logger.warning("mid_weight is too short. Parameters after %d-th are filled with 1." % max_len_for_mid)
667
+ logger.warning("mid_weightが短すぎます。%d個目までの不足したパラメータは1で補われます。" % max_len_for_mid)
668
+ mid_lr_weight = mid_lr_weight + [1.0] * (max_len_for_mid - len(mid_lr_weight))
669
+
670
+ if (up_lr_weight != None) or (mid_lr_weight != None) or (down_lr_weight != None):
671
+ logger.info("apply block learning rate / 階層別学習率を適用します。")
672
+ if down_lr_weight != None:
673
+ down_lr_weight = [w if w > zero_threshold else 0 for w in down_lr_weight]
674
+ logger.info(f"down_lr_weight (shallower -> deeper, 浅い層->深い層): {down_lr_weight}")
675
+ else:
676
+ down_lr_weight = [1.0] * max_len_for_down_or_up
677
+ logger.info("down_lr_weight: all 1.0, すべて1.0")
678
+
679
+ if mid_lr_weight != None:
680
+ mid_lr_weight = [w if w > zero_threshold else 0 for w in mid_lr_weight]
681
+ logger.info(f"mid_lr_weight: {mid_lr_weight}")
682
+ else:
683
+ mid_lr_weight = [1.0] * max_len_for_mid
684
+ logger.info("mid_lr_weight: all 1.0, すべて1.0")
685
+
686
+ if up_lr_weight != None:
687
+ up_lr_weight = [w if w > zero_threshold else 0 for w in up_lr_weight]
688
+ logger.info(f"up_lr_weight (deeper -> shallower, 深い層->浅い層): {up_lr_weight}")
689
+ else:
690
+ up_lr_weight = [1.0] * max_len_for_down_or_up
691
+ logger.info("up_lr_weight: all 1.0, すべて1.0")
692
+
693
+ lr_weight = down_lr_weight + mid_lr_weight + up_lr_weight
694
+
695
+ if is_sdxl:
696
+ lr_weight = [1.0] + lr_weight + [1.0] # add 1.0 for emb_layers and out
697
+
698
+ assert (not is_sdxl and len(lr_weight) == LoRANetwork.NUM_OF_BLOCKS * 2 + LoRANetwork.NUM_OF_MID_BLOCKS) or (
699
+ is_sdxl and len(lr_weight) == 1 + LoRANetwork.SDXL_NUM_OF_BLOCKS * 2 + LoRANetwork.SDXL_NUM_OF_MID_BLOCKS + 1
700
+ ), f"lr_weight length is invalid: {len(lr_weight)}"
701
+
702
+ return lr_weight
703
+
704
+
705
+ # lr_weightが0のblockをblock_dimsから除外する、外部から呼び出す可能性を考慮しておく / remove blocks whose lr_weight is 0 from block_dims; may be called from outside
706
+ def remove_block_dims_and_alphas(
707
+ is_sdxl, block_dims, block_alphas, conv_block_dims, conv_block_alphas, block_lr_weight: Optional[List[float]]
708
+ ):
709
+ if block_lr_weight is not None:
710
+ for i, lr in enumerate(block_lr_weight):
711
+ if lr == 0:
712
+ block_dims[i] = 0
713
+ if conv_block_dims is not None:
714
+ conv_block_dims[i] = 0
715
+ return block_dims, block_alphas, conv_block_dims, conv_block_alphas
716
+
717
+
718
+ # 外部から呼び出す可能性を考慮しておく / may be called from outside
719
+ def get_block_index(lora_name: str, is_sdxl: bool = False) -> int:
720
+ block_idx = -1 # invalid lora name
721
+ if not is_sdxl:
722
+ m = RE_UPDOWN.search(lora_name)
723
+ if m:
724
+ g = m.groups()
725
+ i = int(g[1])
726
+ j = int(g[3])
727
+ if g[2] == "resnets":
728
+ idx = 3 * i + j
729
+ elif g[2] == "attentions":
730
+ idx = 3 * i + j
731
+ elif g[2] == "upsamplers" or g[2] == "downsamplers":
732
+ idx = 3 * i + 2
733
+
734
+ if g[0] == "down":
735
+ block_idx = 1 + idx # 0に該当するLoRAは存在しない
736
+ elif g[0] == "up":
737
+ block_idx = LoRANetwork.NUM_OF_BLOCKS + 1 + idx
738
+ elif "mid_block_" in lora_name:
739
+ block_idx = LoRANetwork.NUM_OF_BLOCKS # idx=12
740
+ else:
741
+ # copy from sdxl_train
742
+ if lora_name.startswith("lora_unet_"):
743
+ name = lora_name[len("lora_unet_") :]
744
+ if name.startswith("time_embed_") or name.startswith("label_emb_"): # No LoRA
745
+ block_idx = 0 # 0
746
+ elif name.startswith("input_blocks_"): # 1-9
747
+ block_idx = 1 + int(name.split("_")[2])
748
+ elif name.startswith("middle_block_"): # 10-12
749
+ block_idx = 10 + int(name.split("_")[2])
750
+ elif name.startswith("output_blocks_"): # 13-21
751
+ block_idx = 13 + int(name.split("_")[2])
752
+ elif name.startswith("out_"): # 22, out, no LoRA
753
+ block_idx = 22
754
+
755
+ return block_idx
756
+
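Illustrative index mappings under the scheme above, assuming RE_UPDOWN (defined earlier in this file) matches the up/down block names as usual:

# SD1.x: down_blocks.1.attentions.0 -> idx 3*1+0 = 3, so block_idx = 1 + 3 = 4
get_block_index("lora_unet_down_blocks_1_attentions_0_transformer_blocks_0_attn1_to_q")       # -> 4
# SDXL: input_blocks are offset by 1, middle_block by 10, output_blocks by 13
get_block_index("lora_unet_input_blocks_4_1_transformer_blocks_0_attn1_to_q", is_sdxl=True)   # -> 5
get_block_index("lora_unet_middle_block_1_transformer_blocks_0_attn1_to_q", is_sdxl=True)     # -> 11
get_block_index("lora_unet_output_blocks_2_1_transformer_blocks_0_attn1_to_q", is_sdxl=True)  # -> 15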
757
+
758
+ def convert_diffusers_to_sai_if_needed(weights_sd):
759
+ # only supports U-Net LoRA modules
760
+
761
+ found_up_down_blocks = False
762
+ for k in list(weights_sd.keys()):
763
+ if "down_blocks" in k:
764
+ found_up_down_blocks = True
765
+ break
766
+ if "up_blocks" in k:
767
+ found_up_down_blocks = True
768
+ break
769
+ if not found_up_down_blocks:
770
+ return
771
+
772
+ from library.sdxl_model_util import make_unet_conversion_map
773
+
774
+ unet_conversion_map = make_unet_conversion_map()
775
+ unet_conversion_map = {hf.replace(".", "_")[:-1]: sd.replace(".", "_")[:-1] for sd, hf in unet_conversion_map}
776
+
777
+ # # add extra conversion
778
+ # unet_conversion_map["up_blocks_1_upsamplers_0"] = "lora_unet_output_blocks_2_2_conv"
779
+
780
+ logger.info(f"Converting LoRA keys from Diffusers to SAI")
781
+ lora_unet_prefix = "lora_unet_"
782
+ for k in list(weights_sd.keys()):
783
+ if not k.startswith(lora_unet_prefix):
784
+ continue
785
+
786
+ unet_module_name = k[len(lora_unet_prefix) :].split(".")[0]
787
+
788
+ found = False
+ # search for conversion: this is slow because the algorithm is O(n^2), but the number of keys is small
789
+ for hf_module_name, sd_module_name in unet_conversion_map.items():
790
+ if hf_module_name in unet_module_name:
791
+ new_key = (
792
+ lora_unet_prefix
793
+ + unet_module_name.replace(hf_module_name, sd_module_name)
794
+ + k[len(lora_unet_prefix) + len(unet_module_name) :]
795
+ )
796
+ weights_sd[new_key] = weights_sd.pop(k)
797
+ found = True
798
+ break
799
+
800
+ if not found:
801
+ logger.warning(f"Key {k} is not found in unet_conversion_map")
802
+
803
+
804
+ # Create network from weights for inference; weights are not loaded here (because they may be merged instead)
805
+ def create_network_from_weights(multiplier, file, vae, text_encoder, unet, weights_sd=None, for_inference=False, **kwargs):
806
+ # if unet is an instance of SdxlUNet2DConditionModel or subclass, set is_sdxl to True
807
+ is_sdxl = unet is not None and issubclass(unet.__class__, SdxlUNet2DConditionModel)
808
+
809
+ if weights_sd is None:
810
+ if os.path.splitext(file)[1] == ".safetensors":
811
+ from safetensors.torch import load_file, safe_open
812
+
813
+ weights_sd = load_file(file)
814
+ else:
815
+ weights_sd = torch.load(file, map_location="cpu")
816
+
817
+ # if keys are Diffusers based, convert to SAI based
818
+ if is_sdxl:
819
+ convert_diffusers_to_sai_if_needed(weights_sd)
820
+
821
+ # get dim/alpha mapping
822
+ modules_dim = {}
823
+ modules_alpha = {}
824
+ for key, value in weights_sd.items():
825
+ if "." not in key:
826
+ continue
827
+
828
+ lora_name = key.split(".")[0]
829
+ if "alpha" in key:
830
+ modules_alpha[lora_name] = value
831
+ elif "lora_down" in key:
832
+ dim = value.size()[0]
833
+ modules_dim[lora_name] = dim
834
+ # logger.info(lora_name, value.size(), dim)
835
+
836
+ # support old LoRA without alpha
837
+ for key in modules_dim.keys():
838
+ if key not in modules_alpha:
839
+ modules_alpha[key] = modules_dim[key]
840
+
841
+ module_class = LoRAInfModule if for_inference else LoRAModule
842
+
843
+ network = LoRANetwork(
844
+ text_encoder,
845
+ unet,
846
+ multiplier=multiplier,
847
+ modules_dim=modules_dim,
848
+ modules_alpha=modules_alpha,
849
+ module_class=module_class,
850
+ is_sdxl=is_sdxl,
851
+ )
852
+
853
+ # block lr
854
+ block_lr_weight = parse_block_lr_kwargs(is_sdxl, kwargs)
855
+ if block_lr_weight is not None:
856
+ network.set_block_lr_weight(block_lr_weight)
857
+
858
+ return network, weights_sd
859
+
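A sketch of inference-time usage; the file path is a placeholder and vae / text_encoder / unet are assumed to be loaded already:

network, weights_sd = create_network_from_weights(
    multiplier=0.8,
    file="my_lora.safetensors",  # placeholder path
    vae=vae,
    text_encoder=text_encoder,
    unet=unet,
    for_inference=True,          # instantiate LoRAInfModule
)
network.apply_to(text_encoder, unet)
info = network.load_state_dict(weights_sd, strict=False)
# alternatively, merge instead of hooking the forward passes:
# network.merge_to(text_encoder, unet, weights_sd, torch.float16, "cuda")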
860
+
861
+ class LoRANetwork(torch.nn.Module):
862
+ NUM_OF_BLOCKS = 12 # フルモデル相当でのup,downの層の数
863
+ NUM_OF_MID_BLOCKS = 1
864
+ SDXL_NUM_OF_BLOCKS = 9 # SDXLのモデルでのinput/outputの層の数 total=1(base) 9(input) + 3(mid) + 9(output) + 1(out) = 23
865
+ SDXL_NUM_OF_MID_BLOCKS = 3
866
+
867
+ UNET_TARGET_REPLACE_MODULE = ["Transformer2DModel"]
868
+ UNET_TARGET_REPLACE_MODULE_CONV2D_3X3 = ["ResnetBlock2D", "Downsample2D", "Upsample2D"]
869
+ TEXT_ENCODER_TARGET_REPLACE_MODULE = ["CLIPAttention", "CLIPSdpaAttention", "CLIPMLP"]
870
+ LORA_PREFIX_UNET = "lora_unet"
871
+ LORA_PREFIX_TEXT_ENCODER = "lora_te"
872
+
873
+ # SDXL: must starts with LORA_PREFIX_TEXT_ENCODER
874
+ LORA_PREFIX_TEXT_ENCODER1 = "lora_te1"
875
+ LORA_PREFIX_TEXT_ENCODER2 = "lora_te2"
876
+
877
+ def __init__(
878
+ self,
879
+ text_encoder: Union[List[CLIPTextModel], CLIPTextModel],
880
+ unet,
881
+ multiplier: float = 1.0,
882
+ lora_dim: int = 4,
883
+ alpha: float = 1,
884
+ dropout: Optional[float] = None,
885
+ rank_dropout: Optional[float] = None,
886
+ module_dropout: Optional[float] = None,
887
+ conv_lora_dim: Optional[int] = None,
888
+ conv_alpha: Optional[float] = None,
889
+ block_dims: Optional[List[int]] = None,
890
+ block_alphas: Optional[List[float]] = None,
891
+ conv_block_dims: Optional[List[int]] = None,
892
+ conv_block_alphas: Optional[List[float]] = None,
893
+ modules_dim: Optional[Dict[str, int]] = None,
894
+ modules_alpha: Optional[Dict[str, int]] = None,
895
+ module_class: Type[object] = LoRAModule,
896
+ varbose: Optional[bool] = False,
897
+ is_sdxl: Optional[bool] = False,
898
+ ) -> None:
899
+ """
900
+ LoRA network: すごく引数が多いが、パターンは以下の通り / there are a lot of arguments, but the usage patterns are:
901
+ 1. lora_dimとalphaを指定 / specify lora_dim and alpha
902
+ 2. lora_dim、alpha、conv_lora_dim、conv_alphaを指定 / specify lora_dim, alpha, conv_lora_dim and conv_alpha
903
+ 3. block_dimsとblock_alphasを指定 : Conv2d3x3には適用しない / specify block_dims and block_alphas: not applied to Conv2d 3x3
904
+ 4. block_dims、block_alphas、conv_block_dims、conv_block_alphasを指定 : Conv2d3x3にも適用する / specify block_dims, block_alphas, conv_block_dims and conv_block_alphas: also applied to Conv2d 3x3
905
+ 5. modules_dimとmodules_alphaを指定 (推論用) / specify modules_dim and modules_alpha (for inference)
906
+ """
907
+ super().__init__()
908
+ self.multiplier = multiplier
909
+
910
+ self.lora_dim = lora_dim
911
+ self.alpha = alpha
912
+ self.conv_lora_dim = conv_lora_dim
913
+ self.conv_alpha = conv_alpha
914
+ self.dropout = dropout
915
+ self.rank_dropout = rank_dropout
916
+ self.module_dropout = module_dropout
917
+
918
+ self.loraplus_lr_ratio = None
919
+ self.loraplus_unet_lr_ratio = None
920
+ self.loraplus_text_encoder_lr_ratio = None
921
+
922
+ if modules_dim is not None:
923
+ logger.info(f"create LoRA network from weights")
924
+ elif block_dims is not None:
925
+ logger.info(f"create LoRA network from block_dims")
926
+ logger.info(
927
+ f"neuron dropout: p={self.dropout}, rank dropout: p={self.rank_dropout}, module dropout: p={self.module_dropout}"
928
+ )
929
+ logger.info(f"block_dims: {block_dims}")
930
+ logger.info(f"block_alphas: {block_alphas}")
931
+ if conv_block_dims is not None:
932
+ logger.info(f"conv_block_dims: {conv_block_dims}")
933
+ logger.info(f"conv_block_alphas: {conv_block_alphas}")
934
+ else:
935
+ logger.info(f"create LoRA network. base dim (rank): {lora_dim}, alpha: {alpha}")
936
+ logger.info(
937
+ f"neuron dropout: p={self.dropout}, rank dropout: p={self.rank_dropout}, module dropout: p={self.module_dropout}"
938
+ )
939
+ if self.conv_lora_dim is not None:
940
+ logger.info(
941
+ f"apply LoRA to Conv2d with kernel size (3,3). dim (rank): {self.conv_lora_dim}, alpha: {self.conv_alpha}"
942
+ )
943
+
944
+ # create module instances
945
+ def create_modules(
946
+ is_unet: bool,
947
+ text_encoder_idx: Optional[int], # None, 1, 2
948
+ root_module: torch.nn.Module,
949
+ target_replace_modules: List[torch.nn.Module],
950
+ ) -> List[LoRAModule]:
951
+ prefix = (
952
+ self.LORA_PREFIX_UNET
953
+ if is_unet
954
+ else (
955
+ self.LORA_PREFIX_TEXT_ENCODER
956
+ if text_encoder_idx is None
957
+ else (self.LORA_PREFIX_TEXT_ENCODER1 if text_encoder_idx == 1 else self.LORA_PREFIX_TEXT_ENCODER2)
958
+ )
959
+ )
960
+ loras = []
961
+ skipped = []
962
+ for name, module in root_module.named_modules():
963
+ if module.__class__.__name__ in target_replace_modules:
964
+ for child_name, child_module in module.named_modules():
965
+ is_linear = child_module.__class__.__name__ == "Linear"
966
+ is_conv2d = child_module.__class__.__name__ == "Conv2d"
967
+ is_conv2d_1x1 = is_conv2d and child_module.kernel_size == (1, 1)
968
+
969
+ if is_linear or is_conv2d:
970
+ lora_name = prefix + "." + name + "." + child_name
971
+ lora_name = lora_name.replace(".", "_")
972
+
973
+ dim = None
974
+ alpha = None
975
+
976
+ if modules_dim is not None:
977
+ # モジュール指定あり
978
+ if lora_name in modules_dim:
979
+ dim = modules_dim[lora_name]
980
+ alpha = modules_alpha[lora_name]
981
+ elif is_unet and block_dims is not None:
982
+ # U-Netでblock_dims指定あり
983
+ block_idx = get_block_index(lora_name, is_sdxl)
984
+ if is_linear or is_conv2d_1x1:
985
+ dim = block_dims[block_idx]
986
+ alpha = block_alphas[block_idx]
987
+ elif conv_block_dims is not None:
988
+ dim = conv_block_dims[block_idx]
989
+ alpha = conv_block_alphas[block_idx]
990
+ else:
991
+ # 通常、すべて対象とする
992
+ if is_linear or is_conv2d_1x1:
993
+ dim = self.lora_dim
994
+ alpha = self.alpha
995
+ elif self.conv_lora_dim is not None:
996
+ dim = self.conv_lora_dim
997
+ alpha = self.conv_alpha
998
+
999
+ if dim is None or dim == 0:
1000
+ # skipした情報を出力 / record which modules were skipped
1001
+ if is_linear or is_conv2d_1x1 or (self.conv_lora_dim is not None or conv_block_dims is not None):
1002
+ skipped.append(lora_name)
1003
+ continue
1004
+
1005
+ lora = module_class(
1006
+ lora_name,
1007
+ child_module,
1008
+ self.multiplier,
1009
+ dim,
1010
+ alpha,
1011
+ dropout=dropout,
1012
+ rank_dropout=rank_dropout,
1013
+ module_dropout=module_dropout,
1014
+ )
1015
+ loras.append(lora)
1016
+ return loras, skipped
1017
+
1018
+ text_encoders = text_encoder if type(text_encoder) == list else [text_encoder]
1019
+
1020
+ # create LoRA for text encoder
1021
+ # 毎回すべてのモジュールを作るのは無駄なので要検討 / creating all modules every time is wasteful, needs review
1022
+ self.text_encoder_loras = []
1023
+ skipped_te = []
1024
+ for i, text_encoder in enumerate(text_encoders):
1025
+ if len(text_encoders) > 1:
1026
+ index = i + 1
1027
+ logger.info(f"create LoRA for Text Encoder {index}:")
1028
+ else:
1029
+ index = None
1030
+ logger.info(f"create LoRA for Text Encoder:")
1031
+
1032
+ text_encoder_loras, skipped = create_modules(False, index, text_encoder, LoRANetwork.TEXT_ENCODER_TARGET_REPLACE_MODULE)
1033
+ self.text_encoder_loras.extend(text_encoder_loras)
1034
+ skipped_te += skipped
1035
+ logger.info(f"create LoRA for Text Encoder: {len(self.text_encoder_loras)} modules.")
1036
+
1037
+ # extend U-Net target modules if conv2d 3x3 is enabled, or load from weights
1038
+ target_modules = LoRANetwork.UNET_TARGET_REPLACE_MODULE
1039
+ if modules_dim is not None or self.conv_lora_dim is not None or conv_block_dims is not None:
1040
+ target_modules += LoRANetwork.UNET_TARGET_REPLACE_MODULE_CONV2D_3X3
1041
+
1042
+ self.unet_loras, skipped_un = create_modules(True, None, unet, target_modules)
1043
+ logger.info(f"create LoRA for U-Net: {len(self.unet_loras)} modules.")
1044
+
1045
+ skipped = skipped_te + skipped_un
1046
+ if varbose and len(skipped) > 0:
1047
+ logger.warning(
1048
+ f"because block_lr_weight is 0 or dim (rank) is 0, {len(skipped)} LoRA modules are skipped / block_lr_weightまたはdim (rank)が0の為、次の{len(skipped)}個のLoRAモジュールはスキップされます:"
1049
+ )
1050
+ for name in skipped:
1051
+ logger.info(f"\t{name}")
1052
+
1053
+ self.block_lr_weight = None
1054
+ self.block_lr = False
1055
+
1056
+ # assertion
1057
+ names = set()
1058
+ for lora in self.text_encoder_loras + self.unet_loras:
1059
+ assert lora.lora_name not in names, f"duplicated lora name: {lora.lora_name}"
1060
+ names.add(lora.lora_name)
1061
+
1062
+ def set_multiplier(self, multiplier):
1063
+ self.multiplier = multiplier
1064
+ for lora in self.text_encoder_loras + self.unet_loras:
1065
+ lora.multiplier = self.multiplier
1066
+
1067
+ def set_enabled(self, is_enabled):
1068
+ for lora in self.text_encoder_loras + self.unet_loras:
1069
+ lora.enabled = is_enabled
1070
+
1071
+ def load_weights(self, file):
1072
+ if os.path.splitext(file)[1] == ".safetensors":
1073
+ from safetensors.torch import load_file
1074
+
1075
+ weights_sd = load_file(file)
1076
+ else:
1077
+ weights_sd = torch.load(file, map_location="cpu")
1078
+
1079
+ info = self.load_state_dict(weights_sd, False)
1080
+ return info
1081
+
1082
+ def apply_to(self, text_encoder, unet, apply_text_encoder=True, apply_unet=True):
1083
+ if apply_text_encoder:
1084
+ logger.info(f"enable LoRA for text encoder: {len(self.text_encoder_loras)} modules")
1085
+ else:
1086
+ self.text_encoder_loras = []
1087
+
1088
+ if apply_unet:
1089
+ logger.info(f"enable LoRA for U-Net: {len(self.unet_loras)} modules")
1090
+ else:
1091
+ self.unet_loras = []
1092
+
1093
+ for lora in self.text_encoder_loras + self.unet_loras:
1094
+ lora.apply_to()
1095
+ self.add_module(lora.lora_name, lora)
1096
+
1097
+ # マージできるかどうかを返す / return whether the weights can be merged
1098
+ def is_mergeable(self):
1099
+ return True
1100
+
1101
+ # TODO refactor to common function with apply_to
1102
+ def merge_to(self, text_encoder, unet, weights_sd, dtype, device):
1103
+ apply_text_encoder = apply_unet = False
1104
+ for key in weights_sd.keys():
1105
+ if key.startswith(LoRANetwork.LORA_PREFIX_TEXT_ENCODER):
1106
+ apply_text_encoder = True
1107
+ elif key.startswith(LoRANetwork.LORA_PREFIX_UNET):
1108
+ apply_unet = True
1109
+
1110
+ if apply_text_encoder:
1111
+ logger.info("enable LoRA for text encoder")
1112
+ else:
1113
+ self.text_encoder_loras = []
1114
+
1115
+ if apply_unet:
1116
+ logger.info("enable LoRA for U-Net")
1117
+ else:
1118
+ self.unet_loras = []
1119
+
1120
+ for lora in self.text_encoder_loras + self.unet_loras:
1121
+ sd_for_lora = {}
1122
+ for key in weights_sd.keys():
1123
+ if key.startswith(lora.lora_name):
1124
+ sd_for_lora[key[len(lora.lora_name) + 1 :]] = weights_sd[key]
1125
+ lora.merge_to(sd_for_lora, dtype, device)
1126
+
1127
+ logger.info(f"weights are merged")
1128
+
1129
+ # 層別学習率用に層ごとの学習率に対する倍率を定義する 引数の順番が逆だがとりあえず気にしない / define per-block learning-rate multipliers; the argument order is inverted, but ignore that for now
1130
+ def set_block_lr_weight(self, block_lr_weight: Optional[List[float]]):
1131
+ self.block_lr = True
1132
+ self.block_lr_weight = block_lr_weight
1133
+
1134
+ def get_lr_weight(self, block_idx: int) -> float:
1135
+ if not self.block_lr or self.block_lr_weight is None:
1136
+ return 1.0
1137
+ return self.block_lr_weight[block_idx]
1138
+
1139
+ def set_loraplus_lr_ratio(self, loraplus_lr_ratio, loraplus_unet_lr_ratio, loraplus_text_encoder_lr_ratio):
1140
+ self.loraplus_lr_ratio = loraplus_lr_ratio
1141
+ self.loraplus_unet_lr_ratio = loraplus_unet_lr_ratio
1142
+ self.loraplus_text_encoder_lr_ratio = loraplus_text_encoder_lr_ratio
1143
+
1144
+ logger.info(f"LoRA+ UNet LR Ratio: {self.loraplus_unet_lr_ratio or self.loraplus_lr_ratio}")
1145
+ logger.info(f"LoRA+ Text Encoder LR Ratio: {self.loraplus_text_encoder_lr_ratio or self.loraplus_lr_ratio}")
1146
+
1147
+ # 二つのText Encoderに別々の学習率を設定できるようにするといいかも / it might be good to allow separate learning rates for the two Text Encoders
1148
+ def prepare_optimizer_params(self, text_encoder_lr, unet_lr, default_lr):
1149
+ # TODO warn if optimizer is not compatible with LoRA+ (but it will cause error so we don't need to check it here?)
1150
+ # if (
1151
+ # self.loraplus_lr_ratio is not None
1152
+ # or self.loraplus_text_encoder_lr_ratio is not None
1153
+ # or self.loraplus_unet_lr_ratio is not None
1154
+ # ):
1155
+ # assert (
1156
+ # optimizer_type.lower() != "prodigy" and "dadapt" not in optimizer_type.lower()
1157
+ # ), "LoRA+ and Prodigy/DAdaptation is not supported / LoRA+とProdigy/DAdaptationの組み合わせはサポートされていません"
1158
+
1159
+ self.requires_grad_(True)
1160
+
1161
+ all_params = []
1162
+ lr_descriptions = []
1163
+
1164
+ def assemble_params(loras, lr, ratio):
1165
+ param_groups = {"lora": {}, "plus": {}}
1166
+ for lora in loras:
1167
+ for name, param in lora.named_parameters():
1168
+ if ratio is not None and "lora_up" in name:
1169
+ param_groups["plus"][f"{lora.lora_name}.{name}"] = param
1170
+ else:
1171
+ param_groups["lora"][f"{lora.lora_name}.{name}"] = param
1172
+
1173
+ params = []
1174
+ descriptions = []
1175
+ for key in param_groups.keys():
1176
+ param_data = {"params": param_groups[key].values()}
1177
+
1178
+ if len(param_data["params"]) == 0:
1179
+ continue
1180
+
1181
+ if lr is not None:
1182
+ if key == "plus":
1183
+ param_data["lr"] = lr * ratio
1184
+ else:
1185
+ param_data["lr"] = lr
1186
+
1187
+ if param_data.get("lr", None) == 0 or param_data.get("lr", None) is None:
1188
+ logger.info("NO LR skipping!")
1189
+ continue
1190
+
1191
+ params.append(param_data)
1192
+ descriptions.append("plus" if key == "plus" else "")
1193
+
1194
+ return params, descriptions
1195
+
1196
+ if self.text_encoder_loras:
1197
+ params, descriptions = assemble_params(
1198
+ self.text_encoder_loras,
1199
+ text_encoder_lr if text_encoder_lr is not None else default_lr,
1200
+ self.loraplus_text_encoder_lr_ratio or self.loraplus_lr_ratio,
1201
+ )
1202
+ all_params.extend(params)
1203
+ lr_descriptions.extend(["textencoder" + (" " + d if d else "") for d in descriptions])
1204
+
1205
+ if self.unet_loras:
1206
+ if self.block_lr:
1207
+ is_sdxl = False
1208
+ for lora in self.unet_loras:
1209
+ if "input_blocks" in lora.lora_name or "output_blocks" in lora.lora_name:
1210
+ is_sdxl = True
1211
+ break
1212
+
1213
+ # 学習率のグラフをblockごとにしたいので、blockごとにloraを分類 / group LoRAs by block so the learning-rate graph can be shown per block
1214
+ block_idx_to_lora = {}
1215
+ for lora in self.unet_loras:
1216
+ idx = get_block_index(lora.lora_name, is_sdxl)
1217
+ if idx not in block_idx_to_lora:
1218
+ block_idx_to_lora[idx] = []
1219
+ block_idx_to_lora[idx].append(lora)
1220
+
1221
+ # blockごとにパラメータを設定する / set parameters per block
1222
+ for idx, block_loras in block_idx_to_lora.items():
1223
+ params, descriptions = assemble_params(
1224
+ block_loras,
1225
+ (unet_lr if unet_lr is not None else default_lr) * self.get_lr_weight(idx),
1226
+ self.loraplus_unet_lr_ratio or self.loraplus_lr_ratio,
1227
+ )
1228
+ all_params.extend(params)
1229
+ lr_descriptions.extend([f"unet_block{idx}" + (" " + d if d else "") for d in descriptions])
1230
+
1231
+ else:
1232
+ params, descriptions = assemble_params(
1233
+ self.unet_loras,
1234
+ unet_lr if unet_lr is not None else default_lr,
1235
+ self.loraplus_unet_lr_ratio or self.loraplus_lr_ratio,
1236
+ )
1237
+ all_params.extend(params)
1238
+ lr_descriptions.extend(["unet" + (" " + d if d else "") for d in descriptions])
1239
+
1240
+ return all_params, lr_descriptions
1241
+
1242
+ def enable_gradient_checkpointing(self):
1243
+ # not supported
1244
+ pass
1245
+
1246
+ def prepare_grad_etc(self, text_encoder, unet):
1247
+ self.requires_grad_(True)
1248
+
1249
+ def on_epoch_start(self, text_encoder, unet):
1250
+ self.train()
1251
+
1252
+ def get_trainable_params(self):
1253
+ return self.parameters()
1254
+
1255
+ def save_weights(self, file, dtype, metadata):
1256
+ if metadata is not None and len(metadata) == 0:
1257
+ metadata = None
1258
+
1259
+ state_dict = self.state_dict()
1260
+
1261
+ if dtype is not None:
1262
+ for key in list(state_dict.keys()):
1263
+ v = state_dict[key]
1264
+ v = v.detach().clone().to("cpu").to(dtype)
1265
+ state_dict[key] = v
1266
+
1267
+ if os.path.splitext(file)[1] == ".safetensors":
1268
+ from safetensors.torch import save_file
1269
+ from library import train_util
1270
+
1271
+ # Precalculate model hashes to save time on indexing
1272
+ if metadata is None:
1273
+ metadata = {}
1274
+ model_hash, legacy_hash = train_util.precalculate_safetensors_hashes(state_dict, metadata)
1275
+ metadata["sshs_model_hash"] = model_hash
1276
+ metadata["sshs_legacy_hash"] = legacy_hash
1277
+
1278
+ save_file(state_dict, file, metadata)
1279
+ else:
1280
+ torch.save(state_dict, file)
1281
+
1282
+ # mask is a tensor with values from 0 to 1
1283
+ def set_region(self, sub_prompt_index, is_last_network, mask):
1284
+ if mask.max() == 0:
1285
+ mask = torch.ones_like(mask)
1286
+
1287
+ self.mask = mask
1288
+ self.sub_prompt_index = sub_prompt_index
1289
+ self.is_last_network = is_last_network
1290
+
1291
+ for lora in self.text_encoder_loras + self.unet_loras:
1292
+ lora.set_network(self)
1293
+
1294
+ def set_current_generation(self, batch_size, num_sub_prompts, width, height, shared, ds_ratio=None):
1295
+ self.batch_size = batch_size
1296
+ self.num_sub_prompts = num_sub_prompts
1297
+ self.current_size = (height, width)
1298
+ self.shared = shared
1299
+
1300
+ # create masks
1301
+ mask = self.mask
1302
+ mask_dic = {}
1303
+ mask = mask.unsqueeze(0).unsqueeze(1) # b(1),c(1),h,w
1304
+ ref_weight = self.text_encoder_loras[0].lora_down.weight if self.text_encoder_loras else self.unet_loras[0].lora_down.weight
1305
+ dtype = ref_weight.dtype
1306
+ device = ref_weight.device
1307
+
1308
+ def resize_add(mh, mw):
1309
+ # logger.info(mh, mw, mh * mw)
1310
+ m = torch.nn.functional.interpolate(mask, (mh, mw), mode="bilinear") # doesn't work in bf16
1311
+ m = m.to(device, dtype=dtype)
1312
+ mask_dic[mh * mw] = m
1313
+
1314
+ h = height // 8
1315
+ w = width // 8
1316
+ for _ in range(4):
1317
+ resize_add(h, w)
1318
+ if h % 2 == 1 or w % 2 == 1: # add extra shape if h/w is not divisible by 2
1319
+ resize_add(h + h % 2, w + w % 2)
1320
+
1321
+ # deep shrink
1322
+ if ds_ratio is not None:
1323
+ hd = int(h * ds_ratio)
1324
+ wd = int(w * ds_ratio)
1325
+ resize_add(hd, wd)
1326
+
1327
+ h = (h + 1) // 2
1328
+ w = (w + 1) // 2
1329
+
1330
+ self.mask_dic = mask_dic
1331
+
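To make the resolution bookkeeping above concrete, a small worked example for a 512x512 generation (the odd-size and deep-shrink variants are omitted):

height = width = 512
h, w = height // 8, width // 8   # 64 x 64 latent
areas = []
for _ in range(4):
    areas.append(h * w)
    h, w = (h + 1) // 2, (w + 1) // 2
print(areas)  # [4096, 1024, 256, 64] -> the keys of mask_dic looked up in get_mask_for_x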
1332
+ def backup_weights(self):
1333
+ # 重みのバックアップを行う / back up the original weights
1334
+ loras: List[LoRAInfModule] = self.text_encoder_loras + self.unet_loras
1335
+ for lora in loras:
1336
+ org_module = lora.org_module_ref[0]
1337
+ if not hasattr(org_module, "_lora_org_weight"):
1338
+ sd = org_module.state_dict()
1339
+ org_module._lora_org_weight = sd["weight"].detach().clone()
1340
+ org_module._lora_restored = True
1341
+
1342
+ def restore_weights(self):
1343
+ # 重みのリストアを行う / restore the original weights
1344
+ loras: List[LoRAInfModule] = self.text_encoder_loras + self.unet_loras
1345
+ for lora in loras:
1346
+ org_module = lora.org_module_ref[0]
1347
+ if not org_module._lora_restored:
1348
+ sd = org_module.state_dict()
1349
+ sd["weight"] = org_module._lora_org_weight
1350
+ org_module.load_state_dict(sd)
1351
+ org_module._lora_restored = True
1352
+
1353
+ def pre_calculation(self):
1354
+ # 事前計算を行う / pre-calculate the merged weights
1355
+ loras: List[LoRAInfModule] = self.text_encoder_loras + self.unet_loras
1356
+ for lora in loras:
1357
+ org_module = lora.org_module_ref[0]
1358
+ sd = org_module.state_dict()
1359
+
1360
+ org_weight = sd["weight"]
1361
+ lora_weight = lora.get_weight().to(org_weight.device, dtype=org_weight.dtype)
1362
+ sd["weight"] = org_weight + lora_weight
1363
+ assert sd["weight"].shape == org_weight.shape
1364
+ org_module.load_state_dict(sd)
1365
+
1366
+ org_module._lora_restored = False
1367
+ lora.enabled = False
1368
+
1369
+ def apply_max_norm_regularization(self, max_norm_value, device):
1370
+ downkeys = []
1371
+ upkeys = []
1372
+ alphakeys = []
1373
+ norms = []
1374
+ keys_scaled = 0
1375
+
1376
+ state_dict = self.state_dict()
1377
+ for key in state_dict.keys():
1378
+ if "lora_down" in key and "weight" in key:
1379
+ downkeys.append(key)
1380
+ upkeys.append(key.replace("lora_down", "lora_up"))
1381
+ alphakeys.append(key.replace("lora_down.weight", "alpha"))
1382
+
1383
+ for i in range(len(downkeys)):
1384
+ down = state_dict[downkeys[i]].to(device)
1385
+ up = state_dict[upkeys[i]].to(device)
1386
+ alpha = state_dict[alphakeys[i]].to(device)
1387
+ dim = down.shape[0]
1388
+ scale = alpha / dim
1389
+
1390
+ if up.shape[2:] == (1, 1) and down.shape[2:] == (1, 1):
1391
+ updown = (up.squeeze(2).squeeze(2) @ down.squeeze(2).squeeze(2)).unsqueeze(2).unsqueeze(3)
1392
+ elif up.shape[2:] == (3, 3) or down.shape[2:] == (3, 3):
1393
+ updown = torch.nn.functional.conv2d(down.permute(1, 0, 2, 3), up).permute(1, 0, 2, 3)
1394
+ else:
1395
+ updown = up @ down
1396
+
1397
+ updown *= scale
1398
+
1399
+ norm = updown.norm().clamp(min=max_norm_value / 2)
1400
+ desired = torch.clamp(norm, max=max_norm_value)
1401
+ ratio = desired.cpu() / norm.cpu()
1402
+ sqrt_ratio = ratio**0.5
1403
+ if ratio != 1:
1404
+ keys_scaled += 1
1405
+ state_dict[upkeys[i]] *= sqrt_ratio
1406
+ state_dict[downkeys[i]] *= sqrt_ratio
1407
+ scalednorm = updown.norm() * ratio
1408
+ norms.append(scalednorm.item())
1409
+
1410
+ return keys_scaled, sum(norms) / len(norms), max(norms)
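The loop above rescales both lora_up and lora_down by sqrt(ratio) so the merged update's norm stays at or below max_norm_value; a tiny numeric sketch of that arithmetic:

import torch

max_norm_value = 1.0
updown_norm = torch.tensor(4.0)                   # norm of scale * (up @ down)
norm = updown_norm.clamp(min=max_norm_value / 2)  # 4.0
desired = torch.clamp(norm, max=max_norm_value)   # 1.0
ratio = desired / norm                            # 0.25
sqrt_ratio = ratio ** 0.5                         # 0.5, applied to both up and down weights
# effective norm after scaling: updown_norm * ratio == 1.0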
lora_diffusers.py ADDED
@@ -0,0 +1,616 @@
1
+ # Diffusersで動くLoRA。このファイル単独で完結する。
2
+ # LoRA module for Diffusers. This file works independently.
3
+
4
+ import bisect
5
+ import math
6
+ import random
7
+ from typing import Any, Dict, List, Mapping, Optional, Union
8
+ from diffusers import UNet2DConditionModel
9
+ import numpy as np
10
+ from tqdm import tqdm
11
+ from transformers import CLIPTextModel
12
+
13
+ import torch
14
+ from library.device_utils import init_ipex, get_preferred_device
15
+ init_ipex()
16
+
17
+ from library.utils import setup_logging
18
+ setup_logging()
19
+ import logging
20
+ logger = logging.getLogger(__name__)
21
+
22
+ def make_unet_conversion_map() -> Dict[str, str]:
23
+ unet_conversion_map_layer = []
24
+
25
+ for i in range(3): # num_blocks is 3 in sdxl
26
+ # loop over downblocks/upblocks
27
+ for j in range(2):
28
+ # loop over resnets/attentions for downblocks
29
+ hf_down_res_prefix = f"down_blocks.{i}.resnets.{j}."
30
+ sd_down_res_prefix = f"input_blocks.{3*i + j + 1}.0."
31
+ unet_conversion_map_layer.append((sd_down_res_prefix, hf_down_res_prefix))
32
+
33
+ if i < 3:
34
+ # no attention layers in down_blocks.3
35
+ hf_down_atn_prefix = f"down_blocks.{i}.attentions.{j}."
36
+ sd_down_atn_prefix = f"input_blocks.{3*i + j + 1}.1."
37
+ unet_conversion_map_layer.append((sd_down_atn_prefix, hf_down_atn_prefix))
38
+
39
+ for j in range(3):
40
+ # loop over resnets/attentions for upblocks
41
+ hf_up_res_prefix = f"up_blocks.{i}.resnets.{j}."
42
+ sd_up_res_prefix = f"output_blocks.{3*i + j}.0."
43
+ unet_conversion_map_layer.append((sd_up_res_prefix, hf_up_res_prefix))
44
+
45
+ # if i > 0: commentout for sdxl
46
+ # no attention layers in up_blocks.0
47
+ hf_up_atn_prefix = f"up_blocks.{i}.attentions.{j}."
48
+ sd_up_atn_prefix = f"output_blocks.{3*i + j}.1."
49
+ unet_conversion_map_layer.append((sd_up_atn_prefix, hf_up_atn_prefix))
50
+
51
+ if i < 3:
52
+ # no downsample in down_blocks.3
53
+ hf_downsample_prefix = f"down_blocks.{i}.downsamplers.0.conv."
54
+ sd_downsample_prefix = f"input_blocks.{3*(i+1)}.0.op."
55
+ unet_conversion_map_layer.append((sd_downsample_prefix, hf_downsample_prefix))
56
+
57
+ # no upsample in up_blocks.3
58
+ hf_upsample_prefix = f"up_blocks.{i}.upsamplers.0."
59
+ sd_upsample_prefix = f"output_blocks.{3*i + 2}.{2}." # change for sdxl
60
+ unet_conversion_map_layer.append((sd_upsample_prefix, hf_upsample_prefix))
61
+
62
+ hf_mid_atn_prefix = "mid_block.attentions.0."
63
+ sd_mid_atn_prefix = "middle_block.1."
64
+ unet_conversion_map_layer.append((sd_mid_atn_prefix, hf_mid_atn_prefix))
65
+
66
+ for j in range(2):
67
+ hf_mid_res_prefix = f"mid_block.resnets.{j}."
68
+ sd_mid_res_prefix = f"middle_block.{2*j}."
69
+ unet_conversion_map_layer.append((sd_mid_res_prefix, hf_mid_res_prefix))
70
+
71
+ unet_conversion_map_resnet = [
72
+ # (stable-diffusion, HF Diffusers)
73
+ ("in_layers.0.", "norm1."),
74
+ ("in_layers.2.", "conv1."),
75
+ ("out_layers.0.", "norm2."),
76
+ ("out_layers.3.", "conv2."),
77
+ ("emb_layers.1.", "time_emb_proj."),
78
+ ("skip_connection.", "conv_shortcut."),
79
+ ]
80
+
81
+ unet_conversion_map = []
82
+ for sd, hf in unet_conversion_map_layer:
83
+ if "resnets" in hf:
84
+ for sd_res, hf_res in unet_conversion_map_resnet:
85
+ unet_conversion_map.append((sd + sd_res, hf + hf_res))
86
+ else:
87
+ unet_conversion_map.append((sd, hf))
88
+
89
+ for j in range(2):
90
+ hf_time_embed_prefix = f"time_embedding.linear_{j+1}."
91
+ sd_time_embed_prefix = f"time_embed.{j*2}."
92
+ unet_conversion_map.append((sd_time_embed_prefix, hf_time_embed_prefix))
93
+
94
+ for j in range(2):
95
+ hf_label_embed_prefix = f"add_embedding.linear_{j+1}."
96
+ sd_label_embed_prefix = f"label_emb.0.{j*2}."
97
+ unet_conversion_map.append((sd_label_embed_prefix, hf_label_embed_prefix))
98
+
99
+ unet_conversion_map.append(("input_blocks.0.0.", "conv_in."))
100
+ unet_conversion_map.append(("out.0.", "conv_norm_out."))
101
+ unet_conversion_map.append(("out.2.", "conv_out."))
102
+
103
+ sd_hf_conversion_map = {sd.replace(".", "_")[:-1]: hf.replace(".", "_")[:-1] for sd, hf in unet_conversion_map}
104
+ return sd_hf_conversion_map
105
+
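For reference, a few entries the resulting map is expected to contain (SAI/LDM key stem on the left, Diffusers stem on the right):

m = make_unet_conversion_map()
assert m["input_blocks_1_1"] == "down_blocks_0_attentions_0"
assert m["middle_block_1"] == "mid_block_attentions_0"
assert m["time_embed_0"] == "time_embedding_linear_1"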
106
+
107
+ UNET_CONVERSION_MAP = make_unet_conversion_map()
108
+
109
+
110
+ class LoRAModule(torch.nn.Module):
111
+ """
112
+ replaces forward method of the original Linear, instead of replacing the original Linear module.
113
+ """
114
+
115
+ def __init__(
116
+ self,
117
+ lora_name,
118
+ org_module: torch.nn.Module,
119
+ multiplier=1.0,
120
+ lora_dim=4,
121
+ alpha=1,
122
+ ):
123
+ """if alpha == 0 or None, alpha is rank (no scaling)."""
124
+ super().__init__()
125
+ self.lora_name = lora_name
126
+
127
+ if org_module.__class__.__name__ == "Conv2d" or org_module.__class__.__name__ == "LoRACompatibleConv":
128
+ in_dim = org_module.in_channels
129
+ out_dim = org_module.out_channels
130
+ else:
131
+ in_dim = org_module.in_features
132
+ out_dim = org_module.out_features
133
+
134
+ self.lora_dim = lora_dim
135
+
136
+ if org_module.__class__.__name__ == "Conv2d" or org_module.__class__.__name__ == "LoRACompatibleConv":
137
+ kernel_size = org_module.kernel_size
138
+ stride = org_module.stride
139
+ padding = org_module.padding
140
+ self.lora_down = torch.nn.Conv2d(in_dim, self.lora_dim, kernel_size, stride, padding, bias=False)
141
+ self.lora_up = torch.nn.Conv2d(self.lora_dim, out_dim, (1, 1), (1, 1), bias=False)
142
+ else:
143
+ self.lora_down = torch.nn.Linear(in_dim, self.lora_dim, bias=False)
144
+ self.lora_up = torch.nn.Linear(self.lora_dim, out_dim, bias=False)
145
+
146
+ if type(alpha) == torch.Tensor:
147
+ alpha = alpha.detach().float().numpy() # without casting, bf16 causes error
148
+ alpha = self.lora_dim if alpha is None or alpha == 0 else alpha
149
+ self.scale = alpha / self.lora_dim
150
+ self.register_buffer("alpha", torch.tensor(alpha)) # 勾配計算に含めない / not included in gradient calculation
151
+
152
+ # same as microsoft's
153
+ torch.nn.init.kaiming_uniform_(self.lora_down.weight, a=math.sqrt(5))
154
+ torch.nn.init.zeros_(self.lora_up.weight)
155
+
156
+ self.multiplier = multiplier
157
+ self.org_module = [org_module]
158
+ self.enabled = True
159
+ self.network: LoRANetwork = None
160
+ self.org_forward = None
161
+
162
+ # override org_module's forward method
163
+ def apply_to(self, multiplier=None):
164
+ if multiplier is not None:
165
+ self.multiplier = multiplier
166
+ if self.org_forward is None:
167
+ self.org_forward = self.org_module[0].forward
168
+ self.org_module[0].forward = self.forward
169
+
170
+ # restore org_module's forward method
171
+ def unapply_to(self):
172
+ if self.org_forward is not None:
173
+ self.org_module[0].forward = self.org_forward
174
+
175
+ # forward with lora
176
+ # scale is used by LoRACompatibleConv, but we ignore it because we have multiplier
177
+ def forward(self, x, scale=1.0):
178
+ if not self.enabled:
179
+ return self.org_forward(x)
180
+ return self.org_forward(x) + self.lora_up(self.lora_down(x)) * self.multiplier * self.scale
181
+
182
+ def set_network(self, network):
183
+ self.network = network
184
+
185
+ # merge lora weight to org weight
186
+ def merge_to(self, multiplier=1.0):
187
+ # get lora weight
188
+ lora_weight = self.get_weight(multiplier)
189
+
190
+ # get org weight
191
+ org_sd = self.org_module[0].state_dict()
192
+ org_weight = org_sd["weight"]
193
+ weight = org_weight + lora_weight.to(org_weight.device, dtype=org_weight.dtype)
194
+
195
+ # set weight to org_module
196
+ org_sd["weight"] = weight
197
+ self.org_module[0].load_state_dict(org_sd)
198
+
199
+ # restore org weight from lora weight
200
+ def restore_from(self, multiplier=1.0):
201
+ # get lora weight
202
+ lora_weight = self.get_weight(multiplier)
203
+
204
+ # get org weight
205
+ org_sd = self.org_module[0].state_dict()
206
+ org_weight = org_sd["weight"]
207
+ weight = org_weight - lora_weight.to(org_weight.device, dtype=org_weight.dtype)
208
+
209
+ # set weight to org_module
210
+ org_sd["weight"] = weight
211
+ self.org_module[0].load_state_dict(org_sd)
212
+
213
+ # return lora weight
214
+ def get_weight(self, multiplier=None):
215
+ if multiplier is None:
216
+ multiplier = self.multiplier
217
+
218
+ # get up/down weight from module
219
+ up_weight = self.lora_up.weight.to(torch.float)
220
+ down_weight = self.lora_down.weight.to(torch.float)
221
+
222
+ # pre-calculated weight
223
+ if len(down_weight.size()) == 2:
224
+ # linear
225
+ weight = self.multiplier * (up_weight @ down_weight) * self.scale
226
+ elif down_weight.size()[2:4] == (1, 1):
227
+ # conv2d 1x1
228
+ weight = (
229
+ self.multiplier
230
+ * (up_weight.squeeze(3).squeeze(2) @ down_weight.squeeze(3).squeeze(2)).unsqueeze(2).unsqueeze(3)
231
+ * self.scale
232
+ )
233
+ else:
234
+ # conv2d 3x3
235
+ conved = torch.nn.functional.conv2d(down_weight.permute(1, 0, 2, 3), up_weight).permute(1, 0, 2, 3)
236
+ weight = self.multiplier * conved * self.scale
237
+
238
+ return weight
239
+
240
+
241
+ # Create network from weights for inference, weights are not loaded here
242
+ def create_network_from_weights(
243
+ text_encoder: Union[CLIPTextModel, List[CLIPTextModel]], unet: UNet2DConditionModel, weights_sd: Dict, multiplier: float = 1.0
244
+ ):
245
+ # get dim/alpha mapping
246
+ modules_dim = {}
247
+ modules_alpha = {}
248
+ for key, value in weights_sd.items():
249
+ if "." not in key:
250
+ continue
251
+
252
+ lora_name = key.split(".")[0]
253
+ if "alpha" in key:
254
+ modules_alpha[lora_name] = value
255
+ elif "lora_down" in key:
256
+ dim = value.size()[0]
257
+ modules_dim[lora_name] = dim
258
+ # logger.info(f"{lora_name} {value.size()} {dim}")
259
+
260
+ # support old LoRA without alpha
261
+ for key in modules_dim.keys():
262
+ if key not in modules_alpha:
263
+ modules_alpha[key] = modules_dim[key]
264
+
265
+ return LoRANetwork(text_encoder, unet, multiplier=multiplier, modules_dim=modules_dim, modules_alpha=modules_alpha)
266
+
267
+
268
+ def merge_lora_weights(pipe, weights_sd: Dict, multiplier: float = 1.0):
269
+ text_encoders = [pipe.text_encoder, pipe.text_encoder_2] if hasattr(pipe, "text_encoder_2") else [pipe.text_encoder]
270
+ unet = pipe.unet
271
+
272
+ lora_network = create_network_from_weights(text_encoders, unet, weights_sd, multiplier=multiplier)
273
+ lora_network.load_state_dict(weights_sd)
274
+ lora_network.merge_to(multiplier=multiplier)
275
+
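A sketch of how this helper might be used with a Diffusers pipeline; the model id and LoRA path are placeholders, and an SDXL pipeline is assumed so that both text encoders are present:

from diffusers import StableDiffusionXLPipeline
from safetensors.torch import load_file

pipe = StableDiffusionXLPipeline.from_pretrained("stabilityai/stable-diffusion-xl-base-1.0")
weights_sd = load_file("my_lora.safetensors")  # placeholder path
merge_lora_weights(pipe, weights_sd, multiplier=0.8)
image = pipe("a photo of a cat").images[0]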
276
+
277
+ # block weightや学習に対応しない簡易版 / simple version without block weight and training
278
+ class LoRANetwork(torch.nn.Module):
279
+ UNET_TARGET_REPLACE_MODULE = ["Transformer2DModel"]
280
+ UNET_TARGET_REPLACE_MODULE_CONV2D_3X3 = ["ResnetBlock2D", "Downsample2D", "Upsample2D"]
281
+ TEXT_ENCODER_TARGET_REPLACE_MODULE = ["CLIPAttention", "CLIPSdpaAttention", "CLIPMLP"]
282
+ LORA_PREFIX_UNET = "lora_unet"
283
+ LORA_PREFIX_TEXT_ENCODER = "lora_te"
284
+
285
+ # SDXL: must start with LORA_PREFIX_TEXT_ENCODER
286
+ LORA_PREFIX_TEXT_ENCODER1 = "lora_te1"
287
+ LORA_PREFIX_TEXT_ENCODER2 = "lora_te2"
288
+
289
+ def __init__(
290
+ self,
291
+ text_encoder: Union[List[CLIPTextModel], CLIPTextModel],
292
+ unet: UNet2DConditionModel,
293
+ multiplier: float = 1.0,
294
+ modules_dim: Optional[Dict[str, int]] = None,
295
+ modules_alpha: Optional[Dict[str, int]] = None,
296
+ varbose: Optional[bool] = False,
297
+ ) -> None:
298
+ super().__init__()
299
+ self.multiplier = multiplier
300
+
301
+ logger.info("create LoRA network from weights")
302
+
303
+ # convert SDXL Stability AI's U-Net modules to Diffusers
304
+ converted = self.convert_unet_modules(modules_dim, modules_alpha)
305
+ if converted:
306
+ logger.info(f"converted {converted} Stability AI's U-Net LoRA modules to Diffusers (SDXL)")
307
+
308
+ # create module instances
309
+ def create_modules(
310
+ is_unet: bool,
311
+ text_encoder_idx: Optional[int], # None, 1, 2
312
+ root_module: torch.nn.Module,
313
+ target_replace_modules: List[torch.nn.Module],
314
+ ) -> List[LoRAModule]:
315
+ prefix = (
316
+ self.LORA_PREFIX_UNET
317
+ if is_unet
318
+ else (
319
+ self.LORA_PREFIX_TEXT_ENCODER
320
+ if text_encoder_idx is None
321
+ else (self.LORA_PREFIX_TEXT_ENCODER1 if text_encoder_idx == 1 else self.LORA_PREFIX_TEXT_ENCODER2)
322
+ )
323
+ )
324
+ loras = []
325
+ skipped = []
326
+ for name, module in root_module.named_modules():
327
+ if module.__class__.__name__ in target_replace_modules:
328
+ for child_name, child_module in module.named_modules():
329
+ is_linear = (
330
+ child_module.__class__.__name__ == "Linear" or child_module.__class__.__name__ == "LoRACompatibleLinear"
331
+ )
332
+ is_conv2d = (
333
+ child_module.__class__.__name__ == "Conv2d" or child_module.__class__.__name__ == "LoRACompatibleConv"
334
+ )
335
+
336
+ if is_linear or is_conv2d:
337
+ lora_name = prefix + "." + name + "." + child_name
338
+ lora_name = lora_name.replace(".", "_")
339
+
340
+ if lora_name not in modules_dim:
341
+ # logger.info(f"skipped {lora_name} (not found in modules_dim)")
342
+ skipped.append(lora_name)
343
+ continue
344
+
345
+ dim = modules_dim[lora_name]
346
+ alpha = modules_alpha[lora_name]
347
+ lora = LoRAModule(
348
+ lora_name,
349
+ child_module,
350
+ self.multiplier,
351
+ dim,
352
+ alpha,
353
+ )
354
+ loras.append(lora)
355
+ return loras, skipped
356
+
357
+ text_encoders = text_encoder if type(text_encoder) == list else [text_encoder]
358
+
359
+ # create LoRA for text encoder
360
+ # 毎回すべてのモジュールを作るのは無駄なので要検討 / it is wasteful to create all modules every time, need to consider
361
+ self.text_encoder_loras: List[LoRAModule] = []
362
+ skipped_te = []
363
+ for i, text_encoder in enumerate(text_encoders):
364
+ if len(text_encoders) > 1:
365
+ index = i + 1
366
+ else:
367
+ index = None
368
+
369
+ text_encoder_loras, skipped = create_modules(False, index, text_encoder, LoRANetwork.TEXT_ENCODER_TARGET_REPLACE_MODULE)
370
+ self.text_encoder_loras.extend(text_encoder_loras)
371
+ skipped_te += skipped
372
+ logger.info(f"create LoRA for Text Encoder: {len(self.text_encoder_loras)} modules.")
373
+ if len(skipped_te) > 0:
374
+ logger.warning(f"skipped {len(skipped_te)} modules because of missing weight for text encoder.")
375
+
376
+ # extend U-Net target modules to include Conv2d 3x3
377
+ target_modules = LoRANetwork.UNET_TARGET_REPLACE_MODULE + LoRANetwork.UNET_TARGET_REPLACE_MODULE_CONV2D_3X3
378
+
379
+ self.unet_loras: List[LoRAModule]
380
+ self.unet_loras, skipped_un = create_modules(True, None, unet, target_modules)
381
+ logger.info(f"create LoRA for U-Net: {len(self.unet_loras)} modules.")
382
+ if len(skipped_un) > 0:
383
+ logger.warning(f"skipped {len(skipped_un)} modules because of missing weight for U-Net.")
384
+
385
+ # assertion
386
+ names = set()
387
+ for lora in self.text_encoder_loras + self.unet_loras:
388
+ names.add(lora.lora_name)
389
+ for lora_name in modules_dim.keys():
390
+ assert lora_name in names, f"{lora_name} is not found in created LoRA modules."
391
+
392
+ # register modules so that load_state_dict works
393
+ for lora in self.text_encoder_loras + self.unet_loras:
394
+ self.add_module(lora.lora_name, lora)
395
+
396
+ # SDXL: convert SDXL Stability AI's U-Net modules to Diffusers
397
+ def convert_unet_modules(self, modules_dim, modules_alpha):
398
+ converted_count = 0
399
+ not_converted_count = 0
400
+
401
+ map_keys = list(UNET_CONVERSION_MAP.keys())
402
+ map_keys.sort()
403
+
404
+ for key in list(modules_dim.keys()):
405
+ if key.startswith(LoRANetwork.LORA_PREFIX_UNET + "_"):
406
+ search_key = key.replace(LoRANetwork.LORA_PREFIX_UNET + "_", "")
407
+ position = bisect.bisect_right(map_keys, search_key)
408
+ map_key = map_keys[position - 1]
409
+ if search_key.startswith(map_key):
410
+ new_key = key.replace(map_key, UNET_CONVERSION_MAP[map_key])
411
+ modules_dim[new_key] = modules_dim[key]
412
+ modules_alpha[new_key] = modules_alpha[key]
413
+ del modules_dim[key]
414
+ del modules_alpha[key]
415
+ converted_count += 1
416
+ else:
417
+ not_converted_count += 1
418
+ assert (
419
+ converted_count == 0 or not_converted_count == 0
420
+ ), f"some modules are not converted: {converted_count} converted, {not_converted_count} not converted"
421
+ return converted_count
422
+
423
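convert_unet_modules above (and load_state_dict below) rename Stability-AI-style U-Net LoRA keys to Diffusers-style keys by sorting the conversion-map prefixes, probing with bisect_right, and confirming the candidate with startswith. The following is an editorial sketch of that lookup; the two map entries are made-up stand-ins, since UNET_CONVERSION_MAP itself is not shown in this excerpt.

# Editorial sketch (not part of the commit): prefix remapping via bisect_right.
import bisect

# hypothetical stand-in for UNET_CONVERSION_MAP (SAI prefix -> Diffusers prefix)
conversion_map = {
    "input_blocks_1_1_": "down_blocks_0_attentions_0_",
    "middle_block_1_": "mid_block_attentions_0_",
}
map_keys = sorted(conversion_map.keys())

def remap(name: str) -> str:
    position = bisect.bisect_right(map_keys, name)
    if position == 0:
        return name
    map_key = map_keys[position - 1]  # closest key that sorts at or before `name`
    if name.startswith(map_key):
        return name.replace(map_key, conversion_map[map_key])
    return name

print(remap("input_blocks_1_1_proj_in"))  # -> down_blocks_0_attentions_0_proj_in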
+ def set_multiplier(self, multiplier):
424
+ self.multiplier = multiplier
425
+ for lora in self.text_encoder_loras + self.unet_loras:
426
+ lora.multiplier = self.multiplier
427
+
428
+ def apply_to(self, multiplier=1.0, apply_text_encoder=True, apply_unet=True):
429
+ if apply_text_encoder:
430
+ logger.info("enable LoRA for text encoder")
431
+ for lora in self.text_encoder_loras:
432
+ lora.apply_to(multiplier)
433
+ if apply_unet:
434
+ logger.info("enable LoRA for U-Net")
435
+ for lora in self.unet_loras:
436
+ lora.apply_to(multiplier)
437
+
438
+ def unapply_to(self):
439
+ for lora in self.text_encoder_loras + self.unet_loras:
440
+ lora.unapply_to()
441
+
442
+ def merge_to(self, multiplier=1.0):
443
+ logger.info("merge LoRA weights to original weights")
444
+ for lora in tqdm(self.text_encoder_loras + self.unet_loras):
445
+ lora.merge_to(multiplier)
446
+ logger.info(f"weights are merged")
447
+
448
+ def restore_from(self, multiplier=1.0):
449
+ logger.info("restore LoRA weights from original weights")
450
+ for lora in tqdm(self.text_encoder_loras + self.unet_loras):
451
+ lora.restore_from(multiplier)
452
+ logger.info(f"weights are restored")
453
+
454
+ def load_state_dict(self, state_dict: Mapping[str, Any], strict: bool = True):
455
+ # convert SDXL Stability AI's state dict to a Diffusers-based state dict
456
+ map_keys = list(UNET_CONVERSION_MAP.keys()) # prefix of U-Net modules
457
+ map_keys.sort()
458
+ for key in list(state_dict.keys()):
459
+ if key.startswith(LoRANetwork.LORA_PREFIX_UNET + "_"):
460
+ search_key = key.replace(LoRANetwork.LORA_PREFIX_UNET + "_", "")
461
+ position = bisect.bisect_right(map_keys, search_key)
462
+ map_key = map_keys[position - 1]
463
+ if search_key.startswith(map_key):
464
+ new_key = key.replace(map_key, UNET_CONVERSION_MAP[map_key])
465
+ state_dict[new_key] = state_dict[key]
466
+ del state_dict[key]
467
+
468
+ # in case of V2, some weights have different shape, so we need to convert them
469
+ # because V2 LoRA is based on U-Net created by use_linear_projection=False
470
+ my_state_dict = self.state_dict()
471
+ for key in state_dict.keys():
472
+ if state_dict[key].size() != my_state_dict[key].size():
473
+ # logger.info(f"convert {key} from {state_dict[key].size()} to {my_state_dict[key].size()}")
474
+ state_dict[key] = state_dict[key].view(my_state_dict[key].size())
475
+
476
+ return super().load_state_dict(state_dict, strict)
477
+
478
+
479
+ if __name__ == "__main__":
480
+ # sample code to use LoRANetwork
481
+ import os
482
+ import argparse
483
+ from diffusers import StableDiffusionPipeline, StableDiffusionXLPipeline
484
+ import torch
485
+
486
+ device = get_preferred_device()
487
+
488
+ parser = argparse.ArgumentParser()
489
+ parser.add_argument("--model_id", type=str, default=None, help="model id for huggingface")
490
+ parser.add_argument("--lora_weights", type=str, default=None, help="path to LoRA weights")
491
+ parser.add_argument("--sdxl", action="store_true", help="use SDXL model")
492
+ parser.add_argument("--prompt", type=str, default="A photo of cat", help="prompt text")
493
+ parser.add_argument("--negative_prompt", type=str, default="", help="negative prompt text")
494
+ parser.add_argument("--seed", type=int, default=0, help="random seed")
495
+ args = parser.parse_args()
496
+
497
+ image_prefix = args.model_id.replace("/", "_") + "_"
498
+
499
+ # load Diffusers model
500
+ logger.info(f"load model from {args.model_id}")
501
+ pipe: Union[StableDiffusionPipeline, StableDiffusionXLPipeline]
502
+ if args.sdxl:
503
+ # use_safetensors=True does not work with 0.18.2
504
+ pipe = StableDiffusionXLPipeline.from_pretrained(args.model_id, variant="fp16", torch_dtype=torch.float16)
505
+ else:
506
+ pipe = StableDiffusionPipeline.from_pretrained(args.model_id, variant="fp16", torch_dtype=torch.float16)
507
+ pipe.to(device)
508
+ pipe.set_use_memory_efficient_attention_xformers(True)
509
+
510
+ text_encoders = [pipe.text_encoder, pipe.text_encoder_2] if args.sdxl else [pipe.text_encoder]
511
+
512
+ # load LoRA weights
513
+ logger.info(f"load LoRA weights from {args.lora_weights}")
514
+ if os.path.splitext(args.lora_weights)[1] == ".safetensors":
515
+ from safetensors.torch import load_file
516
+
517
+ lora_sd = load_file(args.lora_weights)
518
+ else:
519
+ lora_sd = torch.load(args.lora_weights)
520
+
521
+ # create network from LoRA weights and load the weights
522
+ logger.info(f"create LoRA network")
523
+ lora_network: LoRANetwork = create_network_from_weights(text_encoders, pipe.unet, lora_sd, multiplier=1.0)
524
+
525
+ logger.info(f"load LoRA network weights")
526
+ lora_network.load_state_dict(lora_sd)
527
+
528
+ lora_network.to(device, dtype=pipe.unet.dtype) # required to apply_to. merge_to works without this
529
+
530
+ # 必要があれば、元のモデルの重みをバックアップしておく
531
+ # back-up unet/text encoder weights if necessary
532
+ def detach_and_move_to_cpu(state_dict):
533
+ for k, v in state_dict.items():
534
+ state_dict[k] = v.detach().cpu()
535
+ return state_dict
536
+
537
+ org_unet_sd = pipe.unet.state_dict()
538
+ detach_and_move_to_cpu(org_unet_sd)
539
+
540
+ org_text_encoder_sd = pipe.text_encoder.state_dict()
541
+ detach_and_move_to_cpu(org_text_encoder_sd)
542
+
543
+ if args.sdxl:
544
+ org_text_encoder_2_sd = pipe.text_encoder_2.state_dict()
545
+ detach_and_move_to_cpu(org_text_encoder_2_sd)
546
+
547
+ def seed_everything(seed):
548
+ torch.manual_seed(seed)
549
+ torch.cuda.manual_seed_all(seed)
550
+ np.random.seed(seed)
551
+ random.seed(seed)
552
+
553
+ # create image with original weights
554
+ logger.info(f"create image with original weights")
555
+ seed_everything(args.seed)
556
+ image = pipe(args.prompt, negative_prompt=args.negative_prompt).images[0]
557
+ image.save(image_prefix + "original.png")
558
+
559
+ # apply LoRA network to the model: slower than merge_to, but can be reverted easily
560
+ logger.info(f"apply LoRA network to the model")
561
+ lora_network.apply_to(multiplier=1.0)
562
+
563
+ logger.info(f"create image with applied LoRA")
564
+ seed_everything(args.seed)
565
+ image = pipe(args.prompt, negative_prompt=args.negative_prompt).images[0]
566
+ image.save(image_prefix + "applied_lora.png")
567
+
568
+ # unapply LoRA network to the model
569
+ logger.info(f"unapply LoRA network to the model")
570
+ lora_network.unapply_to()
571
+
572
+ logger.info(f"create image with unapplied LoRA")
573
+ seed_everything(args.seed)
574
+ image = pipe(args.prompt, negative_prompt=args.negative_prompt).images[0]
575
+ image.save(image_prefix + "unapplied_lora.png")
576
+
577
+ # merge LoRA network into the model: faster than apply_to, but requires a back-up of the original weights (or restore_from)
578
+ logger.info(f"merge LoRA network to the model")
579
+ lora_network.merge_to(multiplier=1.0)
580
+
581
+ logger.info(f"create image with LoRA")
582
+ seed_everything(args.seed)
583
+ image = pipe(args.prompt, negative_prompt=args.negative_prompt).images[0]
584
+ image.save(image_prefix + "merged_lora.png")
585
+
586
+ # restore (unmerge) LoRA weights: numerically unstable
587
+ # revert the merged weights; due to rounding errors they may not exactly match the original weights
588
+ # restoring the original weights from a saved state_dict is the reliable approach
589
+ logger.info(f"restore (unmerge) LoRA weights")
590
+ lora_network.restore_from(multiplier=1.0)
591
+
592
+ logger.info(f"create image without LoRA")
593
+ seed_everything(args.seed)
594
+ image = pipe(args.prompt, negative_prompt=args.negative_prompt).images[0]
595
+ image.save(image_prefix + "unmerged_lora.png")
596
+
597
+ # restore original weights
598
+ logger.info(f"restore original weights")
599
+ pipe.unet.load_state_dict(org_unet_sd)
600
+ pipe.text_encoder.load_state_dict(org_text_encoder_sd)
601
+ if args.sdxl:
602
+ pipe.text_encoder_2.load_state_dict(org_text_encoder_2_sd)
603
+
604
+ logger.info(f"create image with restored original weights")
605
+ seed_everything(args.seed)
606
+ image = pipe(args.prompt, negative_prompt=args.negative_prompt).images[0]
607
+ image.save(image_prefix + "restore_original.png")
608
+
609
+ # use convenience function to merge LoRA weights
610
+ logger.info(f"merge LoRA weights with convenience function")
611
+ merge_lora_weights(pipe, lora_sd, multiplier=1.0)
612
+
613
+ logger.info(f"create image with merged LoRA weights")
614
+ seed_everything(args.seed)
615
+ image = pipe(args.prompt, negative_prompt=args.negative_prompt).images[0]
616
+ image.save(image_prefix + "convenience_merged_lora.png")
lora_fa.py ADDED
@@ -0,0 +1,1244 @@
1
+ # LoRA network module
2
+ # reference:
3
+ # https://github.com/microsoft/LoRA/blob/main/loralib/layers.py
4
+ # https://github.com/cloneofsimo/lora/blob/master/lora_diffusion/lora.py
5
+
6
+ # temporary implementation of LoRA-FA: https://arxiv.org/abs/2308.03303
7
+ # need to be refactored and merged to lora.py
8
+
9
+ import math
10
+ import os
11
+ from typing import Dict, List, Optional, Tuple, Type, Union
12
+ from diffusers import AutoencoderKL
13
+ from transformers import CLIPTextModel
14
+ import numpy as np
15
+ import torch
16
+ import re
17
+ from library.utils import setup_logging
18
+ setup_logging()
19
+ import logging
20
+ logger = logging.getLogger(__name__)
21
+
22
+ RE_UPDOWN = re.compile(r"(up|down)_blocks_(\d+)_(resnets|upsamplers|downsamplers|attentions)_(\d+)_")
23
+
24
+
25
+ class LoRAModule(torch.nn.Module):
26
+ """
27
+ Replaces the forward method of the original Linear instead of replacing the original Linear module itself.
28
+ """
29
+
30
+ def __init__(
31
+ self,
32
+ lora_name,
33
+ org_module: torch.nn.Module,
34
+ multiplier=1.0,
35
+ lora_dim=4,
36
+ alpha=1,
37
+ dropout=None,
38
+ rank_dropout=None,
39
+ module_dropout=None,
40
+ ):
41
+ """if alpha == 0 or None, alpha is rank (no scaling)."""
42
+ super().__init__()
43
+ self.lora_name = lora_name
44
+
45
+ if org_module.__class__.__name__ == "Conv2d":
46
+ in_dim = org_module.in_channels
47
+ out_dim = org_module.out_channels
48
+ else:
49
+ in_dim = org_module.in_features
50
+ out_dim = org_module.out_features
51
+
52
+ # if limit_rank:
53
+ # self.lora_dim = min(lora_dim, in_dim, out_dim)
54
+ # if self.lora_dim != lora_dim:
55
+ # logger.info(f"{lora_name} dim (rank) is changed to: {self.lora_dim}")
56
+ # else:
57
+ self.lora_dim = lora_dim
58
+
59
+ if org_module.__class__.__name__ == "Conv2d":
60
+ kernel_size = org_module.kernel_size
61
+ stride = org_module.stride
62
+ padding = org_module.padding
63
+ self.lora_down = torch.nn.Conv2d(in_dim, self.lora_dim, kernel_size, stride, padding, bias=False)
64
+ self.lora_up = torch.nn.Conv2d(self.lora_dim, out_dim, (1, 1), (1, 1), bias=False)
65
+ else:
66
+ self.lora_down = torch.nn.Linear(in_dim, self.lora_dim, bias=False)
67
+ self.lora_up = torch.nn.Linear(self.lora_dim, out_dim, bias=False)
68
+
69
+ if type(alpha) == torch.Tensor:
70
+ alpha = alpha.detach().float().numpy() # without casting, bf16 causes error
71
+ alpha = self.lora_dim if alpha is None or alpha == 0 else alpha
72
+ self.scale = alpha / self.lora_dim
73
+ self.register_buffer("alpha", torch.tensor(alpha)) # 定数として扱える
74
+
75
+ # # same as microsoft's
76
+ # torch.nn.init.kaiming_uniform_(self.lora_down.weight, a=math.sqrt(5))
77
+
78
+ # according to the paper, initialize LoRA-A (down) as normal distribution
79
+ torch.nn.init.normal_(self.lora_down.weight, std=math.sqrt(2.0 / (in_dim + self.lora_dim)))
80
+
81
+ torch.nn.init.zeros_(self.lora_up.weight)
82
+
83
+ self.multiplier = multiplier
84
+ self.org_module = org_module # removed in apply_to
85
+ self.dropout = dropout
86
+ self.rank_dropout = rank_dropout
87
+ self.module_dropout = module_dropout
88
+
89
+ def get_trainable_params(self):
90
+ params = self.named_parameters()
91
+ trainable_params = []
92
+ for param in params:
93
+ if param[0] == "lora_up.weight": # up only
94
+ trainable_params.append(param[1])
95
+ return trainable_params
96
+
97
+ def requires_grad_(self, requires_grad: bool = True):
98
+ self.lora_up.requires_grad_(requires_grad)
99
+ self.lora_down.requires_grad_(False)
100
+ return self
101
+
102
+ def apply_to(self):
103
+ self.org_forward = self.org_module.forward
104
+ self.org_module.forward = self.forward
105
+ del self.org_module
106
+
107
+ def forward(self, x):
108
+ org_forwarded = self.org_forward(x)
109
+
110
+ # module dropout
111
+ if self.module_dropout is not None and self.training:
112
+ if torch.rand(1) < self.module_dropout:
113
+ return org_forwarded
114
+
115
+ lx = self.lora_down(x)
116
+
117
+ # normal dropout
118
+ if self.dropout is not None and self.training:
119
+ lx = torch.nn.functional.dropout(lx, p=self.dropout)
120
+
121
+ # rank dropout
122
+ if self.rank_dropout is not None and self.training:
123
+ mask = torch.rand((lx.size(0), self.lora_dim), device=lx.device) > self.rank_dropout
124
+ if len(lx.size()) == 3:
125
+ mask = mask.unsqueeze(1) # for Text Encoder
126
+ elif len(lx.size()) == 4:
127
+ mask = mask.unsqueeze(-1).unsqueeze(-1) # for Conv2d
128
+ lx = lx * mask
129
+
130
+ # scaling for rank dropout: treat as if the rank is changed
131
+ # this could also be computed from the mask, but rank_dropout is used here for its augmentation-like effect
132
+ scale = self.scale * (1.0 / (1.0 - self.rank_dropout)) # written explicitly for readability
133
+ else:
134
+ scale = self.scale
135
+
136
+ lx = self.lora_up(lx)
137
+
138
+ return org_forwarded + lx * self.multiplier * scale
139
+
140
+
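The class above is where the LoRA-FA behaviour lives: lora_down (the A matrix) gets a fixed normal initialisation and is frozen, while only lora_up (the B matrix, initialised to zero) is trained, as reflected in requires_grad_ and get_trainable_params. The following editorial sketch (toy dimensions, plain nn.Linear layers, not part of the commit) reproduces just that freezing pattern.

# Editorial sketch (not part of the commit): the LoRA-FA idea as implemented above --
# the down projection (A) is frozen after a random init and only the up projection (B) is trained.
import math
import torch

in_dim, out_dim, rank = 16, 8, 4
lora_down = torch.nn.Linear(in_dim, rank, bias=False)  # A: frozen
lora_up = torch.nn.Linear(rank, out_dim, bias=False)   # B: trainable, starts at zero

torch.nn.init.normal_(lora_down.weight, std=math.sqrt(2.0 / (in_dim + rank)))
torch.nn.init.zeros_(lora_up.weight)
lora_down.requires_grad_(False)  # mirrors LoRAModule.requires_grad_ / get_trainable_params

trainable = [p for p in list(lora_down.parameters()) + list(lora_up.parameters()) if p.requires_grad]
assert len(trainable) == 1 and trainable[0] is lora_up.weight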
141
+ class LoRAInfModule(LoRAModule):
142
+ def __init__(
143
+ self,
144
+ lora_name,
145
+ org_module: torch.nn.Module,
146
+ multiplier=1.0,
147
+ lora_dim=4,
148
+ alpha=1,
149
+ **kwargs,
150
+ ):
151
+ # no dropout for inference
152
+ super().__init__(lora_name, org_module, multiplier, lora_dim, alpha)
153
+
154
+ self.org_module_ref = [org_module] # keep a reference for later use
155
+ self.enabled = True
156
+
157
+ # check regional or not by lora_name
158
+ self.text_encoder = False
159
+ if lora_name.startswith("lora_te_"):
160
+ self.regional = False
161
+ self.use_sub_prompt = True
162
+ self.text_encoder = True
163
+ elif "attn2_to_k" in lora_name or "attn2_to_v" in lora_name:
164
+ self.regional = False
165
+ self.use_sub_prompt = True
166
+ elif "time_emb" in lora_name:
167
+ self.regional = False
168
+ self.use_sub_prompt = False
169
+ else:
170
+ self.regional = True
171
+ self.use_sub_prompt = False
172
+
173
+ self.network: LoRANetwork = None
174
+
175
+ def set_network(self, network):
176
+ self.network = network
177
+
178
+ # freeze and merge
179
+ def merge_to(self, sd, dtype, device):
180
+ # get up/down weight
181
+ up_weight = sd["lora_up.weight"].to(torch.float).to(device)
182
+ down_weight = sd["lora_down.weight"].to(torch.float).to(device)
183
+
184
+ # extract weight from org_module
185
+ org_sd = self.org_module.state_dict()
186
+ weight = org_sd["weight"].to(torch.float)
187
+
188
+ # merge weight
189
+ if len(weight.size()) == 2:
190
+ # linear
191
+ weight = weight + self.multiplier * (up_weight @ down_weight) * self.scale
192
+ elif down_weight.size()[2:4] == (1, 1):
193
+ # conv2d 1x1
194
+ weight = (
195
+ weight
196
+ + self.multiplier
197
+ * (up_weight.squeeze(3).squeeze(2) @ down_weight.squeeze(3).squeeze(2)).unsqueeze(2).unsqueeze(3)
198
+ * self.scale
199
+ )
200
+ else:
201
+ # conv2d 3x3
202
+ conved = torch.nn.functional.conv2d(down_weight.permute(1, 0, 2, 3), up_weight).permute(1, 0, 2, 3)
203
+ # logger.info(conved.size(), weight.size(), module.stride, module.padding)
204
+ weight = weight + self.multiplier * conved * self.scale
205
+
206
+ # set weight to org_module
207
+ org_sd["weight"] = weight.to(dtype)
208
+ self.org_module.load_state_dict(org_sd)
209
+
210
+ # return this module's weight so that the merge can be undone later
211
+ def get_weight(self, multiplier=None):
212
+ if multiplier is None:
213
+ multiplier = self.multiplier
214
+
215
+ # get up/down weight from module
216
+ up_weight = self.lora_up.weight.to(torch.float)
217
+ down_weight = self.lora_down.weight.to(torch.float)
218
+
219
+ # pre-calculated weight
220
+ if len(down_weight.size()) == 2:
221
+ # linear
222
+ weight = self.multiplier * (up_weight @ down_weight) * self.scale
223
+ elif down_weight.size()[2:4] == (1, 1):
224
+ # conv2d 1x1
225
+ weight = (
226
+ self.multiplier
227
+ * (up_weight.squeeze(3).squeeze(2) @ down_weight.squeeze(3).squeeze(2)).unsqueeze(2).unsqueeze(3)
228
+ * self.scale
229
+ )
230
+ else:
231
+ # conv2d 3x3
232
+ conved = torch.nn.functional.conv2d(down_weight.permute(1, 0, 2, 3), up_weight).permute(1, 0, 2, 3)
233
+ weight = self.multiplier * conved * self.scale
234
+
235
+ return weight
236
+
237
+ def set_region(self, region):
238
+ self.region = region
239
+ self.region_mask = None
240
+
241
+ def default_forward(self, x):
242
+ # logger.info("default_forward", self.lora_name, x.size())
243
+ return self.org_forward(x) + self.lora_up(self.lora_down(x)) * self.multiplier * self.scale
244
+
245
+ def forward(self, x):
246
+ if not self.enabled:
247
+ return self.org_forward(x)
248
+
249
+ if self.network is None or self.network.sub_prompt_index is None:
250
+ return self.default_forward(x)
251
+ if not self.regional and not self.use_sub_prompt:
252
+ return self.default_forward(x)
253
+
254
+ if self.regional:
255
+ return self.regional_forward(x)
256
+ else:
257
+ return self.sub_prompt_forward(x)
258
+
259
+ def get_mask_for_x(self, x):
260
+ # calculate size from shape of x
261
+ if len(x.size()) == 4:
262
+ h, w = x.size()[2:4]
263
+ area = h * w
264
+ else:
265
+ area = x.size()[1]
266
+
267
+ mask = self.network.mask_dic[area]
268
+ if mask is None:
269
+ raise ValueError(f"mask is None for resolution {area}")
270
+ if len(x.size()) != 4:
271
+ mask = torch.reshape(mask, (1, -1, 1))
272
+ return mask
273
+
274
+ def regional_forward(self, x):
275
+ if "attn2_to_out" in self.lora_name:
276
+ return self.to_out_forward(x)
277
+
278
+ if self.network.mask_dic is None: # sub_prompt_index >= 3
279
+ return self.default_forward(x)
280
+
281
+ # apply mask for LoRA result
282
+ lx = self.lora_up(self.lora_down(x)) * self.multiplier * self.scale
283
+ mask = self.get_mask_for_x(lx)
284
+ # logger.info("regional", self.lora_name, self.network.sub_prompt_index, lx.size(), mask.size())
285
+ lx = lx * mask
286
+
287
+ x = self.org_forward(x)
288
+ x = x + lx
289
+
290
+ if "attn2_to_q" in self.lora_name and self.network.is_last_network:
291
+ x = self.postp_to_q(x)
292
+
293
+ return x
294
+
295
+ def postp_to_q(self, x):
296
+ # repeat x to num_sub_prompts
297
+ has_real_uncond = x.size()[0] // self.network.batch_size == 3
298
+ qc = self.network.batch_size # uncond
299
+ qc += self.network.batch_size * self.network.num_sub_prompts # cond
300
+ if has_real_uncond:
301
+ qc += self.network.batch_size # real_uncond
302
+
303
+ query = torch.zeros((qc, x.size()[1], x.size()[2]), device=x.device, dtype=x.dtype)
304
+ query[: self.network.batch_size] = x[: self.network.batch_size]
305
+
306
+ for i in range(self.network.batch_size):
307
+ qi = self.network.batch_size + i * self.network.num_sub_prompts
308
+ query[qi : qi + self.network.num_sub_prompts] = x[self.network.batch_size + i]
309
+
310
+ if has_real_uncond:
311
+ query[-self.network.batch_size :] = x[-self.network.batch_size :]
312
+
313
+ # logger.info("postp_to_q", self.lora_name, x.size(), query.size(), self.network.num_sub_prompts)
314
+ return query
315
+
316
+ def sub_prompt_forward(self, x):
317
+ if x.size()[0] == self.network.batch_size: # if uncond in text_encoder, do not apply LoRA
318
+ return self.org_forward(x)
319
+
320
+ emb_idx = self.network.sub_prompt_index
321
+ if not self.text_encoder:
322
+ emb_idx += self.network.batch_size
323
+
324
+ # apply sub prompt of X
325
+ lx = x[emb_idx :: self.network.num_sub_prompts]
326
+ lx = self.lora_up(self.lora_down(lx)) * self.multiplier * self.scale
327
+
328
+ # logger.info("sub_prompt_forward", self.lora_name, x.size(), lx.size(), emb_idx)
329
+
330
+ x = self.org_forward(x)
331
+ x[emb_idx :: self.network.num_sub_prompts] += lx
332
+
333
+ return x
334
+
335
+ def to_out_forward(self, x):
336
+ # logger.info("to_out_forward", self.lora_name, x.size(), self.network.is_last_network)
337
+
338
+ if self.network.is_last_network:
339
+ masks = [None] * self.network.num_sub_prompts
340
+ self.network.shared[self.lora_name] = (None, masks)
341
+ else:
342
+ lx, masks = self.network.shared[self.lora_name]
343
+
344
+ # call own LoRA
345
+ x1 = x[self.network.batch_size + self.network.sub_prompt_index :: self.network.num_sub_prompts]
346
+ lx1 = self.lora_up(self.lora_down(x1)) * self.multiplier * self.scale
347
+
348
+ if self.network.is_last_network:
349
+ lx = torch.zeros(
350
+ (self.network.num_sub_prompts * self.network.batch_size, *lx1.size()[1:]), device=lx1.device, dtype=lx1.dtype
351
+ )
352
+ self.network.shared[self.lora_name] = (lx, masks)
353
+
354
+ # logger.info("to_out_forward", lx.size(), lx1.size(), self.network.sub_prompt_index, self.network.num_sub_prompts)
355
+ lx[self.network.sub_prompt_index :: self.network.num_sub_prompts] += lx1
356
+ masks[self.network.sub_prompt_index] = self.get_mask_for_x(lx1)
357
+
358
+ # if not last network, return x and masks
359
+ x = self.org_forward(x)
360
+ if not self.network.is_last_network:
361
+ return x
362
+
363
+ lx, masks = self.network.shared.pop(self.lora_name)
364
+
365
+ # if last network, combine separated x with mask weighted sum
366
+ has_real_uncond = x.size()[0] // self.network.batch_size == self.network.num_sub_prompts + 2
367
+
368
+ out = torch.zeros((self.network.batch_size * (3 if has_real_uncond else 2), *x.size()[1:]), device=x.device, dtype=x.dtype)
369
+ out[: self.network.batch_size] = x[: self.network.batch_size] # uncond
370
+ if has_real_uncond:
371
+ out[-self.network.batch_size :] = x[-self.network.batch_size :] # real_uncond
372
+
373
+ # logger.info("to_out_forward", self.lora_name, self.network.sub_prompt_index, self.network.num_sub_prompts)
374
+ # for i in range(len(masks)):
375
+ # if masks[i] is None:
376
+ # masks[i] = torch.zeros_like(masks[-1])
377
+
378
+ mask = torch.cat(masks)
379
+ mask_sum = torch.sum(mask, dim=0) + 1e-4
380
+ for i in range(self.network.batch_size):
381
+ # process each image in the batch separately
382
+ lx1 = lx[i * self.network.num_sub_prompts : (i + 1) * self.network.num_sub_prompts]
383
+ lx1 = lx1 * mask
384
+ lx1 = torch.sum(lx1, dim=0)
385
+
386
+ xi = self.network.batch_size + i * self.network.num_sub_prompts
387
+ x1 = x[xi : xi + self.network.num_sub_prompts]
388
+ x1 = x1 * mask
389
+ x1 = torch.sum(x1, dim=0)
390
+ x1 = x1 / mask_sum
391
+
392
+ x1 = x1 + lx1
393
+ out[self.network.batch_size + i] = x1
394
+
395
+ # logger.info("to_out_forward", x.size(), out.size(), has_real_uncond)
396
+ return out
397
+
398
+
399
+ def parse_block_lr_kwargs(nw_kwargs):
400
+ down_lr_weight = nw_kwargs.get("down_lr_weight", None)
401
+ mid_lr_weight = nw_kwargs.get("mid_lr_weight", None)
402
+ up_lr_weight = nw_kwargs.get("up_lr_weight", None)
403
+
404
+ # if none of the above is specified, treat block lr as disabled and return None
405
+ if down_lr_weight is None and mid_lr_weight is None and up_lr_weight is None:
406
+ return None, None, None
407
+
408
+ # extract learning rate weight for each block
409
+ if down_lr_weight is not None:
410
+ # if some parameters are not set, use zero
411
+ if "," in down_lr_weight:
412
+ down_lr_weight = [(float(s) if s else 0.0) for s in down_lr_weight.split(",")]
413
+
414
+ if mid_lr_weight is not None:
415
+ mid_lr_weight = float(mid_lr_weight)
416
+
417
+ if up_lr_weight is not None:
418
+ if "," in up_lr_weight:
419
+ up_lr_weight = [(float(s) if s else 0.0) for s in up_lr_weight.split(",")]
420
+
421
+ down_lr_weight, mid_lr_weight, up_lr_weight = get_block_lr_weight(
422
+ down_lr_weight, mid_lr_weight, up_lr_weight, float(nw_kwargs.get("block_lr_zero_threshold", 0.0))
423
+ )
424
+
425
+ return down_lr_weight, mid_lr_weight, up_lr_weight
426
+
427
+
428
+ def create_network(
429
+ multiplier: float,
430
+ network_dim: Optional[int],
431
+ network_alpha: Optional[float],
432
+ vae: AutoencoderKL,
433
+ text_encoder: Union[CLIPTextModel, List[CLIPTextModel]],
434
+ unet,
435
+ neuron_dropout: Optional[float] = None,
436
+ **kwargs,
437
+ ):
438
+ if network_dim is None:
439
+ network_dim = 4 # default
440
+ if network_alpha is None:
441
+ network_alpha = 1.0
442
+
443
+ # extract dim/alpha for conv2d, and block dim
444
+ conv_dim = kwargs.get("conv_dim", None)
445
+ conv_alpha = kwargs.get("conv_alpha", None)
446
+ if conv_dim is not None:
447
+ conv_dim = int(conv_dim)
448
+ if conv_alpha is None:
449
+ conv_alpha = 1.0
450
+ else:
451
+ conv_alpha = float(conv_alpha)
452
+
453
+ # block dim/alpha/lr
454
+ block_dims = kwargs.get("block_dims", None)
455
+ down_lr_weight, mid_lr_weight, up_lr_weight = parse_block_lr_kwargs(kwargs)
456
+
457
+ # if any of the above is specified, enable per-block dim (rank)
458
+ if block_dims is not None or down_lr_weight is not None or mid_lr_weight is not None or up_lr_weight is not None:
459
+ block_alphas = kwargs.get("block_alphas", None)
460
+ conv_block_dims = kwargs.get("conv_block_dims", None)
461
+ conv_block_alphas = kwargs.get("conv_block_alphas", None)
462
+
463
+ block_dims, block_alphas, conv_block_dims, conv_block_alphas = get_block_dims_and_alphas(
464
+ block_dims, block_alphas, network_dim, network_alpha, conv_block_dims, conv_block_alphas, conv_dim, conv_alpha
465
+ )
466
+
467
+ # remove block dim/alpha without learning rate
468
+ block_dims, block_alphas, conv_block_dims, conv_block_alphas = remove_block_dims_and_alphas(
469
+ block_dims, block_alphas, conv_block_dims, conv_block_alphas, down_lr_weight, mid_lr_weight, up_lr_weight
470
+ )
471
+
472
+ else:
473
+ block_alphas = None
474
+ conv_block_dims = None
475
+ conv_block_alphas = None
476
+
477
+ # rank/module dropout
478
+ rank_dropout = kwargs.get("rank_dropout", None)
479
+ if rank_dropout is not None:
480
+ rank_dropout = float(rank_dropout)
481
+ module_dropout = kwargs.get("module_dropout", None)
482
+ if module_dropout is not None:
483
+ module_dropout = float(module_dropout)
484
+
485
+ # quite a lot of arguments here ( ^ω^)...
486
+ network = LoRANetwork(
487
+ text_encoder,
488
+ unet,
489
+ multiplier=multiplier,
490
+ lora_dim=network_dim,
491
+ alpha=network_alpha,
492
+ dropout=neuron_dropout,
493
+ rank_dropout=rank_dropout,
494
+ module_dropout=module_dropout,
495
+ conv_lora_dim=conv_dim,
496
+ conv_alpha=conv_alpha,
497
+ block_dims=block_dims,
498
+ block_alphas=block_alphas,
499
+ conv_block_dims=conv_block_dims,
500
+ conv_block_alphas=conv_block_alphas,
501
+ varbose=True,
502
+ )
503
+
504
+ if up_lr_weight is not None or mid_lr_weight is not None or down_lr_weight is not None:
505
+ network.set_block_lr_weight(up_lr_weight, mid_lr_weight, down_lr_weight)
506
+
507
+ return network
508
+
509
+
510
+ # keep in mind that this method may be called from outside
511
+ # network_dim and network_alpha already contain default values
512
+ # block_dims and block_alphas are either both None or both set
513
+ # conv_dim and conv_alpha are either both None or both set
514
+ def get_block_dims_and_alphas(
515
+ block_dims, block_alphas, network_dim, network_alpha, conv_block_dims, conv_block_alphas, conv_dim, conv_alpha
516
+ ):
517
+ num_total_blocks = LoRANetwork.NUM_OF_BLOCKS * 2 + 1
518
+
519
+ def parse_ints(s):
520
+ return [int(i) for i in s.split(",")]
521
+
522
+ def parse_floats(s):
523
+ return [float(i) for i in s.split(",")]
524
+
525
+ # parse block_dims and block_alphas; both always end up with values
526
+ if block_dims is not None:
527
+ block_dims = parse_ints(block_dims)
528
+ assert (
529
+ len(block_dims) == num_total_blocks
530
+ ), f"block_dims must have {num_total_blocks} elements / block_dimsは{num_total_blocks}個指定してください"
531
+ else:
532
+ logger.warning(f"block_dims is not specified. all dims are set to {network_dim} / block_dimsが指定されていません。すべてのdimは{network_dim}になります")
533
+ block_dims = [network_dim] * num_total_blocks
534
+
535
+ if block_alphas is not None:
536
+ block_alphas = parse_floats(block_alphas)
537
+ assert (
538
+ len(block_alphas) == num_total_blocks
539
+ ), f"block_alphas must have {num_total_blocks} elements / block_alphasは{num_total_blocks}個指定してください"
540
+ else:
541
+ logger.warning(
542
+ f"block_alphas is not specified. all alphas are set to {network_alpha} / block_alphasが指定されていません。すべてのalphaは{network_alpha}になります"
543
+ )
544
+ block_alphas = [network_alpha] * num_total_blocks
545
+
546
+ # parse conv_block_dims and conv_block_alphas only if specified; otherwise use conv_dim and conv_alpha
547
+ if conv_block_dims is not None:
548
+ conv_block_dims = parse_ints(conv_block_dims)
549
+ assert (
550
+ len(conv_block_dims) == num_total_blocks
551
+ ), f"conv_block_dims must have {num_total_blocks} elements / conv_block_dimsは{num_total_blocks}個指定してください"
552
+
553
+ if conv_block_alphas is not None:
554
+ conv_block_alphas = parse_floats(conv_block_alphas)
555
+ assert (
556
+ len(conv_block_alphas) == num_total_blocks
557
+ ), f"conv_block_alphas must have {num_total_blocks} elements / conv_block_alphasは{num_total_blocks}個指定してください"
558
+ else:
559
+ if conv_alpha is None:
560
+ conv_alpha = 1.0
561
+ logger.warning(
562
+ f"conv_block_alphas is not specified. all alphas are set to {conv_alpha} / conv_block_alphasが指定されていません。すべてのalphaは{conv_alpha}になります"
563
+ )
564
+ conv_block_alphas = [conv_alpha] * num_total_blocks
565
+ else:
566
+ if conv_dim is not None:
567
+ logger.warning(
568
+ f"conv_dim/alpha for all blocks are set to {conv_dim} and {conv_alpha} / すべてのブロックのconv_dimとalphaは{conv_dim}および{conv_alpha}になります"
569
+ )
570
+ conv_block_dims = [conv_dim] * num_total_blocks
571
+ conv_block_alphas = [conv_alpha] * num_total_blocks
572
+ else:
573
+ conv_block_dims = None
574
+ conv_block_alphas = None
575
+
576
+ return block_dims, block_alphas, conv_block_dims, conv_block_alphas
577
+
578
+
579
+ # define per-block multipliers for layer-wise learning rates; this may be called from outside
580
+ def get_block_lr_weight(
581
+ down_lr_weight, mid_lr_weight, up_lr_weight, zero_threshold
582
+ ) -> Tuple[List[float], List[float], List[float]]:
583
+ # if no parameters are specified, do nothing and keep the previous behavior
584
+ if up_lr_weight is None and mid_lr_weight is None and down_lr_weight is None:
585
+ return None, None, None
586
+
587
+ max_len = LoRANetwork.NUM_OF_BLOCKS # number of up/down blocks in the full model
588
+
589
+ def get_list(name_with_suffix) -> List[float]:
590
+ import math
591
+
592
+ tokens = name_with_suffix.split("+")
593
+ name = tokens[0]
594
+ base_lr = float(tokens[1]) if len(tokens) > 1 else 0.0
595
+
596
+ if name == "cosine":
597
+ return [math.sin(math.pi * (i / (max_len - 1)) / 2) + base_lr for i in reversed(range(max_len))]
598
+ elif name == "sine":
599
+ return [math.sin(math.pi * (i / (max_len - 1)) / 2) + base_lr for i in range(max_len)]
600
+ elif name == "linear":
601
+ return [i / (max_len - 1) + base_lr for i in range(max_len)]
602
+ elif name == "reverse_linear":
603
+ return [i / (max_len - 1) + base_lr for i in reversed(range(max_len))]
604
+ elif name == "zeros":
605
+ return [0.0 + base_lr] * max_len
606
+ else:
607
+ logger.error(
608
+ "Unknown lr_weight argument %s is used. Valid arguments: / 不明なlr_weightの引数 %s が使われました。有効な引数:\n\tcosine, sine, linear, reverse_linear, zeros"
609
+ % (name)
610
+ )
611
+ return None
612
+
613
+ if type(down_lr_weight) == str:
614
+ down_lr_weight = get_list(down_lr_weight)
615
+ if type(up_lr_weight) == str:
616
+ up_lr_weight = get_list(up_lr_weight)
617
+
618
+ if (up_lr_weight != None and len(up_lr_weight) > max_len) or (down_lr_weight != None and len(down_lr_weight) > max_len):
619
+ logger.warning("down_weight or up_weight is too long. Parameters after %d-th are ignored." % max_len)
620
+ logger.warning("down_weightもしくはup_weightが長すぎます。%d個目以降のパラメータは無視されます。" % max_len)
621
+ up_lr_weight = up_lr_weight[:max_len]
622
+ down_lr_weight = down_lr_weight[:max_len]
623
+
624
+ if (up_lr_weight != None and len(up_lr_weight) < max_len) or (down_lr_weight != None and len(down_lr_weight) < max_len):
625
+ logger.warning("down_weight or up_weight is too short. Parameters after %d-th are filled with 1." % max_len)
626
+ logger.warning("down_weightもしくはup_weightが短すぎます。%d個目までの不足したパラメータは1で補われます。" % max_len)
627
+
628
+ if down_lr_weight != None and len(down_lr_weight) < max_len:
629
+ down_lr_weight = down_lr_weight + [1.0] * (max_len - len(down_lr_weight))
630
+ if up_lr_weight != None and len(up_lr_weight) < max_len:
631
+ up_lr_weight = up_lr_weight + [1.0] * (max_len - len(up_lr_weight))
632
+
633
+ if (up_lr_weight != None) or (mid_lr_weight != None) or (down_lr_weight != None):
634
+ logger.info("apply block learning rate / 階層別学習率を適用します。")
635
+ if down_lr_weight != None:
636
+ down_lr_weight = [w if w > zero_threshold else 0 for w in down_lr_weight]
637
+ logger.info(f"down_lr_weight (shallower -> deeper, 浅い層->深い層): {down_lr_weight}")
638
+ else:
639
+ logger.info("down_lr_weight: all 1.0, すべて1.0")
640
+
641
+ if mid_lr_weight != None:
642
+ mid_lr_weight = mid_lr_weight if mid_lr_weight > zero_threshold else 0
643
+ logger.info(f"mid_lr_weight: {mid_lr_weight}")
644
+ else:
645
+ logger.info("mid_lr_weight: 1.0")
646
+
647
+ if up_lr_weight != None:
648
+ up_lr_weight = [w if w > zero_threshold else 0 for w in up_lr_weight]
649
+ logger.info(f"up_lr_weight (deeper -> shallower, 深い層->浅い層): {up_lr_weight}")
650
+ else:
651
+ logger.info("up_lr_weight: all 1.0, すべて1.0")
652
+
653
+ return down_lr_weight, mid_lr_weight, up_lr_weight
654
+
655
+
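The string presets handled by get_block_lr_weight above ("cosine", "sine", "linear", "reverse_linear", "zeros", optionally with a "+<base>" suffix) expand to NUM_OF_BLOCKS per-block multipliers, and anything at or below block_lr_zero_threshold is forced to zero, which later removes those blocks entirely. A small standalone sketch (editorial, not importing this module) of the "linear" preset:

# Editorial sketch (not part of the commit): expanding a block-lr preset the same way
# get_block_lr_weight's get_list does, then applying a zero threshold.
max_len = 12  # LoRANetwork.NUM_OF_BLOCKS

def linear_preset(base_lr: float = 0.0):
    return [i / (max_len - 1) + base_lr for i in range(max_len)]

weights = linear_preset()
zero_threshold = 0.1
weights = [w if w > zero_threshold else 0 for w in weights]
print(weights)  # the first two blocks are zeroed out, so their LoRA modules are dropped entirely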
656
+ # exclude blocks whose lr_weight is 0 from block_dims; this may be called from outside
657
+ def remove_block_dims_and_alphas(
658
+ block_dims, block_alphas, conv_block_dims, conv_block_alphas, down_lr_weight, mid_lr_weight, up_lr_weight
659
+ ):
660
+ # set 0 to block dim without learning rate to remove the block
661
+ if down_lr_weight != None:
662
+ for i, lr in enumerate(down_lr_weight):
663
+ if lr == 0:
664
+ block_dims[i] = 0
665
+ if conv_block_dims is not None:
666
+ conv_block_dims[i] = 0
667
+ if mid_lr_weight != None:
668
+ if mid_lr_weight == 0:
669
+ block_dims[LoRANetwork.NUM_OF_BLOCKS] = 0
670
+ if conv_block_dims is not None:
671
+ conv_block_dims[LoRANetwork.NUM_OF_BLOCKS] = 0
672
+ if up_lr_weight != None:
673
+ for i, lr in enumerate(up_lr_weight):
674
+ if lr == 0:
675
+ block_dims[LoRANetwork.NUM_OF_BLOCKS + 1 + i] = 0
676
+ if conv_block_dims is not None:
677
+ conv_block_dims[LoRANetwork.NUM_OF_BLOCKS + 1 + i] = 0
678
+
679
+ return block_dims, block_alphas, conv_block_dims, conv_block_alphas
680
+
681
+
682
+ # this may be called from outside
683
+ def get_block_index(lora_name: str) -> int:
684
+ block_idx = -1 # invalid lora name
685
+
686
+ m = RE_UPDOWN.search(lora_name)
687
+ if m:
688
+ g = m.groups()
689
+ i = int(g[1])
690
+ j = int(g[3])
691
+ if g[2] == "resnets":
692
+ idx = 3 * i + j
693
+ elif g[2] == "attentions":
694
+ idx = 3 * i + j
695
+ elif g[2] == "upsamplers" or g[2] == "downsamplers":
696
+ idx = 3 * i + 2
697
+
698
+ if g[0] == "down":
699
+ block_idx = 1 + idx # there is no LoRA corresponding to index 0
700
+ elif g[0] == "up":
701
+ block_idx = LoRANetwork.NUM_OF_BLOCKS + 1 + idx
702
+
703
+ elif "mid_block_" in lora_name:
704
+ block_idx = LoRANetwork.NUM_OF_BLOCKS # idx=12
705
+
706
+ return block_idx
707
+
708
+
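get_block_index above maps a lora_unet module name to a block index: down blocks land on 1-11 (index 0 is unused), the mid block on NUM_OF_BLOCKS (12), and up blocks from 13 upward. The expected results below are an editorial sketch derived from the formulas above; the module names are representative examples, not an exhaustive list.

# Editorial sketch (not part of the commit): expected get_block_index results for a few
# lora_unet module names, assuming the formulas in the function above.
examples = {
    "lora_unet_down_blocks_0_attentions_0_proj_in": 1,  # 1 + (3*0 + 0); index 0 is unused
    "lora_unet_down_blocks_1_downsamplers_0_conv": 6,   # 1 + (3*1 + 2)
    "lora_unet_mid_block_attentions_0_proj_out": 12,    # NUM_OF_BLOCKS
    "lora_unet_up_blocks_1_resnets_2_conv1": 18,        # 12 + 1 + (3*1 + 2)
}
for name, expected_idx in examples.items():
    print(f"{name} -> block {expected_idx}")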
709
+ # Create network from weights for inference; weights are not loaded here (because they may be merged instead)
710
+ def create_network_from_weights(multiplier, file, vae, text_encoder, unet, weights_sd=None, for_inference=False, **kwargs):
711
+ if weights_sd is None:
712
+ if os.path.splitext(file)[1] == ".safetensors":
713
+ from safetensors.torch import load_file, safe_open
714
+
715
+ weights_sd = load_file(file)
716
+ else:
717
+ weights_sd = torch.load(file, map_location="cpu")
718
+
719
+ # get dim/alpha mapping
720
+ modules_dim = {}
721
+ modules_alpha = {}
722
+ for key, value in weights_sd.items():
723
+ if "." not in key:
724
+ continue
725
+
726
+ lora_name = key.split(".")[0]
727
+ if "alpha" in key:
728
+ modules_alpha[lora_name] = value
729
+ elif "lora_down" in key:
730
+ dim = value.size()[0]
731
+ modules_dim[lora_name] = dim
732
+ # logger.info(lora_name, value.size(), dim)
733
+
734
+ # support old LoRA without alpha
735
+ for key in modules_dim.keys():
736
+ if key not in modules_alpha:
737
+ modules_alpha[key] = modules_dim[key]
738
+
739
+ module_class = LoRAInfModule if for_inference else LoRAModule
740
+
741
+ network = LoRANetwork(
742
+ text_encoder, unet, multiplier=multiplier, modules_dim=modules_dim, modules_alpha=modules_alpha, module_class=module_class
743
+ )
744
+
745
+ # block lr
746
+ down_lr_weight, mid_lr_weight, up_lr_weight = parse_block_lr_kwargs(kwargs)
747
+ if up_lr_weight is not None or mid_lr_weight is not None or down_lr_weight is not None:
748
+ network.set_block_lr_weight(up_lr_weight, mid_lr_weight, down_lr_weight)
749
+
750
+ return network, weights_sd
751
+
752
+
753
+ class LoRANetwork(torch.nn.Module):
754
+ NUM_OF_BLOCKS = 12 # number of up/down blocks in the full model
755
+
756
+ UNET_TARGET_REPLACE_MODULE = ["Transformer2DModel"]
757
+ UNET_TARGET_REPLACE_MODULE_CONV2D_3X3 = ["ResnetBlock2D", "Downsample2D", "Upsample2D"]
758
+ TEXT_ENCODER_TARGET_REPLACE_MODULE = ["CLIPAttention", "CLIPSdpaAttention", "CLIPMLP"]
759
+ LORA_PREFIX_UNET = "lora_unet"
760
+ LORA_PREFIX_TEXT_ENCODER = "lora_te"
761
+
762
+ # SDXL: must start with LORA_PREFIX_TEXT_ENCODER
763
+ LORA_PREFIX_TEXT_ENCODER1 = "lora_te1"
764
+ LORA_PREFIX_TEXT_ENCODER2 = "lora_te2"
765
+
766
+ def __init__(
767
+ self,
768
+ text_encoder: Union[List[CLIPTextModel], CLIPTextModel],
769
+ unet,
770
+ multiplier: float = 1.0,
771
+ lora_dim: int = 4,
772
+ alpha: float = 1,
773
+ dropout: Optional[float] = None,
774
+ rank_dropout: Optional[float] = None,
775
+ module_dropout: Optional[float] = None,
776
+ conv_lora_dim: Optional[int] = None,
777
+ conv_alpha: Optional[float] = None,
778
+ block_dims: Optional[List[int]] = None,
779
+ block_alphas: Optional[List[float]] = None,
780
+ conv_block_dims: Optional[List[int]] = None,
781
+ conv_block_alphas: Optional[List[float]] = None,
782
+ modules_dim: Optional[Dict[str, int]] = None,
783
+ modules_alpha: Optional[Dict[str, int]] = None,
784
+ module_class: Type[object] = LoRAModule,
785
+ varbose: Optional[bool] = False,
786
+ ) -> None:
787
+ """
788
+ LoRA network: there are many arguments, but they follow these patterns:
789
+ 1. specify lora_dim and alpha
790
+ 2. specify lora_dim, alpha, conv_lora_dim and conv_alpha
791
+ 3. specify block_dims and block_alphas : not applied to Conv2d 3x3
792
+ 4. specify block_dims, block_alphas, conv_block_dims and conv_block_alphas : also applied to Conv2d 3x3
793
+ 5. specify modules_dim and modules_alpha (for inference)
794
+ """
795
+ super().__init__()
796
+ self.multiplier = multiplier
797
+
798
+ self.lora_dim = lora_dim
799
+ self.alpha = alpha
800
+ self.conv_lora_dim = conv_lora_dim
801
+ self.conv_alpha = conv_alpha
802
+ self.dropout = dropout
803
+ self.rank_dropout = rank_dropout
804
+ self.module_dropout = module_dropout
805
+
806
+ if modules_dim is not None:
807
+ logger.info(f"create LoRA network from weights")
808
+ elif block_dims is not None:
809
+ logger.info(f"create LoRA network from block_dims")
810
+ logger.info(f"neuron dropout: p={self.dropout}, rank dropout: p={self.rank_dropout}, module dropout: p={self.module_dropout}")
811
+ logger.info(f"block_dims: {block_dims}")
812
+ logger.info(f"block_alphas: {block_alphas}")
813
+ if conv_block_dims is not None:
814
+ logger.info(f"conv_block_dims: {conv_block_dims}")
815
+ logger.info(f"conv_block_alphas: {conv_block_alphas}")
816
+ else:
817
+ logger.info(f"create LoRA network. base dim (rank): {lora_dim}, alpha: {alpha}")
818
+ logger.info(f"neuron dropout: p={self.dropout}, rank dropout: p={self.rank_dropout}, module dropout: p={self.module_dropout}")
819
+ if self.conv_lora_dim is not None:
820
+ logger.info(f"apply LoRA to Conv2d with kernel size (3,3). dim (rank): {self.conv_lora_dim}, alpha: {self.conv_alpha}")
821
+
822
+ # create module instances
823
+ def create_modules(
824
+ is_unet: bool,
825
+ text_encoder_idx: Optional[int], # None, 1, 2
826
+ root_module: torch.nn.Module,
827
+ target_replace_modules: List[torch.nn.Module],
828
+ ) -> List[LoRAModule]:
829
+ prefix = (
830
+ self.LORA_PREFIX_UNET
831
+ if is_unet
832
+ else (
833
+ self.LORA_PREFIX_TEXT_ENCODER
834
+ if text_encoder_idx is None
835
+ else (self.LORA_PREFIX_TEXT_ENCODER1 if text_encoder_idx == 1 else self.LORA_PREFIX_TEXT_ENCODER2)
836
+ )
837
+ )
838
+ loras = []
839
+ skipped = []
840
+ for name, module in root_module.named_modules():
841
+ if module.__class__.__name__ in target_replace_modules:
842
+ for child_name, child_module in module.named_modules():
843
+ is_linear = child_module.__class__.__name__ == "Linear"
844
+ is_conv2d = child_module.__class__.__name__ == "Conv2d"
845
+ is_conv2d_1x1 = is_conv2d and child_module.kernel_size == (1, 1)
846
+
847
+ if is_linear or is_conv2d:
848
+ lora_name = prefix + "." + name + "." + child_name
849
+ lora_name = lora_name.replace(".", "_")
850
+
851
+ dim = None
852
+ alpha = None
853
+
854
+ if modules_dim is not None:
855
+ # modules are explicitly specified
856
+ if lora_name in modules_dim:
857
+ dim = modules_dim[lora_name]
858
+ alpha = modules_alpha[lora_name]
859
+ elif is_unet and block_dims is not None:
860
+ # block_dims is specified for the U-Net
861
+ block_idx = get_block_index(lora_name)
862
+ if is_linear or is_conv2d_1x1:
863
+ dim = block_dims[block_idx]
864
+ alpha = block_alphas[block_idx]
865
+ elif conv_block_dims is not None:
866
+ dim = conv_block_dims[block_idx]
867
+ alpha = conv_block_alphas[block_idx]
868
+ else:
869
+ # normal case: target all modules
870
+ if is_linear or is_conv2d_1x1:
871
+ dim = self.lora_dim
872
+ alpha = self.alpha
873
+ elif self.conv_lora_dim is not None:
874
+ dim = self.conv_lora_dim
875
+ alpha = self.conv_alpha
876
+
877
+ if dim is None or dim == 0:
878
+ # record skipped modules so they can be reported
879
+ if is_linear or is_conv2d_1x1 or (self.conv_lora_dim is not None or conv_block_dims is not None):
880
+ skipped.append(lora_name)
881
+ continue
882
+
883
+ lora = module_class(
884
+ lora_name,
885
+ child_module,
886
+ self.multiplier,
887
+ dim,
888
+ alpha,
889
+ dropout=dropout,
890
+ rank_dropout=rank_dropout,
891
+ module_dropout=module_dropout,
892
+ )
893
+ loras.append(lora)
894
+ return loras, skipped
895
+
896
+ text_encoders = text_encoder if type(text_encoder) == list else [text_encoder]
897
+
898
+ # create LoRA for text encoder
899
+ # creating all modules every time is wasteful; needs reconsideration
900
+ self.text_encoder_loras = []
901
+ skipped_te = []
902
+ for i, text_encoder in enumerate(text_encoders):
903
+ if len(text_encoders) > 1:
904
+ index = i + 1
905
+ logger.info(f"create LoRA for Text Encoder {index}:")
906
+ else:
907
+ index = None
908
+ logger.info(f"create LoRA for Text Encoder:")
909
+
910
+ text_encoder_loras, skipped = create_modules(False, index, text_encoder, LoRANetwork.TEXT_ENCODER_TARGET_REPLACE_MODULE)
911
+ self.text_encoder_loras.extend(text_encoder_loras)
912
+ skipped_te += skipped
913
+ logger.info(f"create LoRA for Text Encoder: {len(self.text_encoder_loras)} modules.")
914
+
915
+ # extend U-Net target modules if conv2d 3x3 is enabled, or load from weights
916
+ target_modules = LoRANetwork.UNET_TARGET_REPLACE_MODULE
917
+ if modules_dim is not None or self.conv_lora_dim is not None or conv_block_dims is not None:
918
+ target_modules += LoRANetwork.UNET_TARGET_REPLACE_MODULE_CONV2D_3X3
919
+
920
+ self.unet_loras, skipped_un = create_modules(True, None, unet, target_modules)
921
+ logger.info(f"create LoRA for U-Net: {len(self.unet_loras)} modules.")
922
+
923
+ skipped = skipped_te + skipped_un
924
+ if varbose and len(skipped) > 0:
925
+ logger.warning(
926
+ f"because block_lr_weight is 0 or dim (rank) is 0, {len(skipped)} LoRA modules are skipped / block_lr_weightまたはdim (rank)が0の為、次の{len(skipped)}個のLoRAモジュールはスキップされます:"
927
+ )
928
+ for name in skipped:
929
+ logger.info(f"\t{name}")
930
+
931
+ self.up_lr_weight: List[float] = None
932
+ self.down_lr_weight: List[float] = None
933
+ self.mid_lr_weight: float = None
934
+ self.block_lr = False
935
+
936
+ # assertion
937
+ names = set()
938
+ for lora in self.text_encoder_loras + self.unet_loras:
939
+ assert lora.lora_name not in names, f"duplicated lora name: {lora.lora_name}"
940
+ names.add(lora.lora_name)
941
+
942
+ def set_multiplier(self, multiplier):
943
+ self.multiplier = multiplier
944
+ for lora in self.text_encoder_loras + self.unet_loras:
945
+ lora.multiplier = self.multiplier
946
+
947
+ def load_weights(self, file):
948
+ if os.path.splitext(file)[1] == ".safetensors":
949
+ from safetensors.torch import load_file
950
+
951
+ weights_sd = load_file(file)
952
+ else:
953
+ weights_sd = torch.load(file, map_location="cpu")
954
+
955
+ info = self.load_state_dict(weights_sd, False)
956
+ return info
957
+
958
+ def apply_to(self, text_encoder, unet, apply_text_encoder=True, apply_unet=True):
959
+ if apply_text_encoder:
960
+ logger.info("enable LoRA for text encoder")
961
+ else:
962
+ self.text_encoder_loras = []
963
+
964
+ if apply_unet:
965
+ logger.info("enable LoRA for U-Net")
966
+ else:
967
+ self.unet_loras = []
968
+
969
+ for lora in self.text_encoder_loras + self.unet_loras:
970
+ lora.apply_to()
971
+ self.add_module(lora.lora_name, lora)
972
+
973
+ # return whether the weights can be merged
974
+ def is_mergeable(self):
975
+ return True
976
+
977
+ # TODO refactor to common function with apply_to
978
+ def merge_to(self, text_encoder, unet, weights_sd, dtype, device):
979
+ apply_text_encoder = apply_unet = False
980
+ for key in weights_sd.keys():
981
+ if key.startswith(LoRANetwork.LORA_PREFIX_TEXT_ENCODER):
982
+ apply_text_encoder = True
983
+ elif key.startswith(LoRANetwork.LORA_PREFIX_UNET):
984
+ apply_unet = True
985
+
986
+ if apply_text_encoder:
987
+ logger.info("enable LoRA for text encoder")
988
+ else:
989
+ self.text_encoder_loras = []
990
+
991
+ if apply_unet:
992
+ logger.info("enable LoRA for U-Net")
993
+ else:
994
+ self.unet_loras = []
995
+
996
+ for lora in self.text_encoder_loras + self.unet_loras:
997
+ sd_for_lora = {}
998
+ for key in weights_sd.keys():
999
+ if key.startswith(lora.lora_name):
1000
+ sd_for_lora[key[len(lora.lora_name) + 1 :]] = weights_sd[key]
1001
+ lora.merge_to(sd_for_lora, dtype, device)
1002
+
1003
+ logger.info(f"weights are merged")
1004
+
1005
+ # define per-block multipliers for layer-wise learning rates; the argument order is reversed, but leave it for now
1006
+ def set_block_lr_weight(
1007
+ self,
1008
+ up_lr_weight: List[float] = None,
1009
+ mid_lr_weight: float = None,
1010
+ down_lr_weight: List[float] = None,
1011
+ ):
1012
+ self.block_lr = True
1013
+ self.down_lr_weight = down_lr_weight
1014
+ self.mid_lr_weight = mid_lr_weight
1015
+ self.up_lr_weight = up_lr_weight
1016
+
1017
+ def get_lr_weight(self, lora: LoRAModule) -> float:
1018
+ lr_weight = 1.0
1019
+ block_idx = get_block_index(lora.lora_name)
1020
+ if block_idx < 0:
1021
+ return lr_weight
1022
+
1023
+ if block_idx < LoRANetwork.NUM_OF_BLOCKS:
1024
+ if self.down_lr_weight != None:
1025
+ lr_weight = self.down_lr_weight[block_idx]
1026
+ elif block_idx == LoRANetwork.NUM_OF_BLOCKS:
1027
+ if self.mid_lr_weight != None:
1028
+ lr_weight = self.mid_lr_weight
1029
+ elif block_idx > LoRANetwork.NUM_OF_BLOCKS:
1030
+ if self.up_lr_weight != None:
1031
+ lr_weight = self.up_lr_weight[block_idx - LoRANetwork.NUM_OF_BLOCKS - 1]
1032
+
1033
+ return lr_weight
1034
+
1035
+ # it might be good to allow separate learning rates for the two Text Encoders
1036
+ def prepare_optimizer_params(self, text_encoder_lr, unet_lr, default_lr):
1037
+ self.requires_grad_(True)
1038
+ all_params = []
1039
+
1040
+ def enumerate_params(loras: List[LoRAModule]):
1041
+ params = []
1042
+ for lora in loras:
1043
+ # params.extend(lora.parameters())
1044
+ params.extend(lora.get_trainable_params())
1045
+ return params
1046
+
1047
+ if self.text_encoder_loras:
1048
+ param_data = {"params": enumerate_params(self.text_encoder_loras)}
1049
+ if text_encoder_lr is not None:
1050
+ param_data["lr"] = text_encoder_lr
1051
+ all_params.append(param_data)
1052
+
1053
+ if self.unet_loras:
1054
+ if self.block_lr:
1055
+ # group LoRA modules by block so the learning-rate graph can be drawn per block
1056
+ block_idx_to_lora = {}
1057
+ for lora in self.unet_loras:
1058
+ idx = get_block_index(lora.lora_name)
1059
+ if idx not in block_idx_to_lora:
1060
+ block_idx_to_lora[idx] = []
1061
+ block_idx_to_lora[idx].append(lora)
1062
+
1063
+ # set up a parameter group for each block
1064
+ for idx, block_loras in block_idx_to_lora.items():
1065
+ param_data = {"params": enumerate_params(block_loras)}
1066
+
1067
+ if unet_lr is not None:
1068
+ param_data["lr"] = unet_lr * self.get_lr_weight(block_loras[0])
1069
+ elif default_lr is not None:
1070
+ param_data["lr"] = default_lr * self.get_lr_weight(block_loras[0])
1071
+ if ("lr" in param_data) and (param_data["lr"] == 0):
1072
+ continue
1073
+ all_params.append(param_data)
1074
+
1075
+ else:
1076
+ param_data = {"params": enumerate_params(self.unet_loras)}
1077
+ if unet_lr is not None:
1078
+ param_data["lr"] = unet_lr
1079
+ all_params.append(param_data)
1080
+
1081
+ return all_params
1082
+
1083
+ def enable_gradient_checkpointing(self):
1084
+ # not supported
1085
+ pass
1086
+
1087
+ def prepare_grad_etc(self, text_encoder, unet):
1088
+ self.requires_grad_(True)
1089
+
1090
+ def on_epoch_start(self, text_encoder, unet):
1091
+ self.train()
1092
+
1093
+ def get_trainable_params(self):
1094
+ return self.parameters()
1095
+
1096
+ def save_weights(self, file, dtype, metadata):
1097
+ if metadata is not None and len(metadata) == 0:
1098
+ metadata = None
1099
+
1100
+ state_dict = self.state_dict()
1101
+
1102
+ if dtype is not None:
1103
+ for key in list(state_dict.keys()):
1104
+ v = state_dict[key]
1105
+ v = v.detach().clone().to("cpu").to(dtype)
1106
+ state_dict[key] = v
1107
+
1108
+ if os.path.splitext(file)[1] == ".safetensors":
1109
+ from safetensors.torch import save_file
1110
+ from library import train_util
1111
+
1112
+ # Precalculate model hashes to save time on indexing
1113
+ if metadata is None:
1114
+ metadata = {}
1115
+ model_hash, legacy_hash = train_util.precalculate_safetensors_hashes(state_dict, metadata)
1116
+ metadata["sshs_model_hash"] = model_hash
1117
+ metadata["sshs_legacy_hash"] = legacy_hash
1118
+
1119
+ save_file(state_dict, file, metadata)
1120
+ else:
1121
+ torch.save(state_dict, file)
1122
+
1123
+ # mask is a tensor with values from 0 to 1
1124
+ def set_region(self, sub_prompt_index, is_last_network, mask):
1125
+ if mask.max() == 0:
1126
+ mask = torch.ones_like(mask)
1127
+
1128
+ self.mask = mask
1129
+ self.sub_prompt_index = sub_prompt_index
1130
+ self.is_last_network = is_last_network
1131
+
1132
+ for lora in self.text_encoder_loras + self.unet_loras:
1133
+ lora.set_network(self)
1134
+
1135
+ def set_current_generation(self, batch_size, num_sub_prompts, width, height, shared):
1136
+ self.batch_size = batch_size
1137
+ self.num_sub_prompts = num_sub_prompts
1138
+ self.current_size = (height, width)
1139
+ self.shared = shared
1140
+
1141
+ # create masks
1142
+ mask = self.mask
1143
+ mask_dic = {}
1144
+ mask = mask.unsqueeze(0).unsqueeze(1) # b(1),c(1),h,w
1145
+ ref_weight = self.text_encoder_loras[0].lora_down.weight if self.text_encoder_loras else self.unet_loras[0].lora_down.weight
1146
+ dtype = ref_weight.dtype
1147
+ device = ref_weight.device
1148
+
1149
+ def resize_add(mh, mw):
1150
+ # logger.info(mh, mw, mh * mw)
1151
+ m = torch.nn.functional.interpolate(mask, (mh, mw), mode="bilinear") # doesn't work in bf16
1152
+ m = m.to(device, dtype=dtype)
1153
+ mask_dic[mh * mw] = m
1154
+
1155
+ h = height // 8
1156
+ w = width // 8
1157
+ for _ in range(4):
1158
+ resize_add(h, w)
1159
+ if h % 2 == 1 or w % 2 == 1: # add extra shape if h/w is not divisible by 2
1160
+ resize_add(h + h % 2, w + w % 2)
1161
+ h = (h + 1) // 2
1162
+ w = (w + 1) // 2
1163
+
1164
+ self.mask_dic = mask_dic
1165
+
1166
+ def backup_weights(self):
1167
+ # back up the original weights
1168
+ loras: List[LoRAInfModule] = self.text_encoder_loras + self.unet_loras
1169
+ for lora in loras:
1170
+ org_module = lora.org_module_ref[0]
1171
+ if not hasattr(org_module, "_lora_org_weight"):
1172
+ sd = org_module.state_dict()
1173
+ org_module._lora_org_weight = sd["weight"].detach().clone()
1174
+ org_module._lora_restored = True
1175
+
1176
+ def restore_weights(self):
1177
+ # restore the original weights
1178
+ loras: List[LoRAInfModule] = self.text_encoder_loras + self.unet_loras
1179
+ for lora in loras:
1180
+ org_module = lora.org_module_ref[0]
1181
+ if not org_module._lora_restored:
1182
+ sd = org_module.state_dict()
1183
+ sd["weight"] = org_module._lora_org_weight
1184
+ org_module.load_state_dict(sd)
1185
+ org_module._lora_restored = True
1186
+
1187
+ def pre_calculation(self):
1188
+ # pre-calculate: merge the LoRA weights into the original modules
1189
+ loras: List[LoRAInfModule] = self.text_encoder_loras + self.unet_loras
1190
+ for lora in loras:
1191
+ org_module = lora.org_module_ref[0]
1192
+ sd = org_module.state_dict()
1193
+
1194
+ org_weight = sd["weight"]
1195
+ lora_weight = lora.get_weight().to(org_weight.device, dtype=org_weight.dtype)
1196
+ sd["weight"] = org_weight + lora_weight
1197
+ assert sd["weight"].shape == org_weight.shape
1198
+ org_module.load_state_dict(sd)
1199
+
1200
+ org_module._lora_restored = False
1201
+ lora.enabled = False
1202
+
1203
+ def apply_max_norm_regularization(self, max_norm_value, device):
1204
+ downkeys = []
1205
+ upkeys = []
1206
+ alphakeys = []
1207
+ norms = []
1208
+ keys_scaled = 0
1209
+
1210
+ state_dict = self.state_dict()
1211
+ for key in state_dict.keys():
1212
+ if "lora_down" in key and "weight" in key:
1213
+ downkeys.append(key)
1214
+ upkeys.append(key.replace("lora_down", "lora_up"))
1215
+ alphakeys.append(key.replace("lora_down.weight", "alpha"))
1216
+
1217
+ for i in range(len(downkeys)):
1218
+ down = state_dict[downkeys[i]].to(device)
1219
+ up = state_dict[upkeys[i]].to(device)
1220
+ alpha = state_dict[alphakeys[i]].to(device)
1221
+ dim = down.shape[0]
1222
+ scale = alpha / dim
1223
+
1224
+ if up.shape[2:] == (1, 1) and down.shape[2:] == (1, 1):
1225
+ updown = (up.squeeze(2).squeeze(2) @ down.squeeze(2).squeeze(2)).unsqueeze(2).unsqueeze(3)
1226
+ elif up.shape[2:] == (3, 3) or down.shape[2:] == (3, 3):
1227
+ updown = torch.nn.functional.conv2d(down.permute(1, 0, 2, 3), up).permute(1, 0, 2, 3)
1228
+ else:
1229
+ updown = up @ down
1230
+
1231
+ updown *= scale
1232
+
1233
+ norm = updown.norm().clamp(min=max_norm_value / 2)
1234
+ desired = torch.clamp(norm, max=max_norm_value)
1235
+ ratio = desired.cpu() / norm.cpu()
1236
+ sqrt_ratio = ratio**0.5
1237
+ if ratio != 1:
1238
+ keys_scaled += 1
1239
+ state_dict[upkeys[i]] *= sqrt_ratio
1240
+ state_dict[downkeys[i]] *= sqrt_ratio
1241
+ scalednorm = updown.norm() * ratio
1242
+ norms.append(scalednorm.item())
1243
+
1244
+ return keys_scaled, sum(norms) / len(norms), max(norms)
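
Note on apply_max_norm_regularization above: the merged LoRA delta is (alpha / dim) * up @ down, so multiplying both the up and down weights by sqrt(ratio) scales that product, and hence its norm, by ratio, clamping it at max_norm_value. A minimal sketch of this identity with toy tensors (up, down and ratio are made-up values, not taken from the code above):

import torch

torch.manual_seed(0)
up = torch.randn(8, 4)     # stand-in for a lora_up weight
down = torch.randn(4, 16)  # stand-in for a lora_down weight
ratio = 0.5                # stand-in for desired / norm computed above

# Scaling both factors by sqrt(ratio) scales the merged update by ratio.
scaled = (up * ratio**0.5) @ (down * ratio**0.5)
assert torch.allclose(scaled, (up @ down) * ratio, atol=1e-5)
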
lora_interrogator.py ADDED
@@ -0,0 +1,146 @@
1
+
2
+
3
+ from tqdm import tqdm
4
+ from library import model_util
5
+ import library.train_util as train_util
6
+ import argparse
7
+ from transformers import CLIPTokenizer
8
+
9
+ import torch
10
+ from library.device_utils import init_ipex, get_preferred_device
11
+ init_ipex()
12
+
13
+ import library.model_util as model_util
14
+ import lora
15
+ from library.utils import setup_logging
16
+ setup_logging()
17
+ import logging
18
+ logger = logging.getLogger(__name__)
19
+
20
+ TOKENIZER_PATH = "openai/clip-vit-large-patch14"
21
+ V2_STABLE_DIFFUSION_PATH = "stabilityai/stable-diffusion-2" # only the tokenizer is used from here
22
+
23
+ DEVICE = get_preferred_device()
24
+
25
+
26
+ def interrogate(args):
27
+ weights_dtype = torch.float16
28
+
29
+ # prepare everything
30
+ logger.info(f"loading SD model: {args.sd_model}")
31
+ args.pretrained_model_name_or_path = args.sd_model
32
+ args.vae = None
33
+ text_encoder, vae, unet, _ = train_util._load_target_model(args, weights_dtype, DEVICE)
34
+
35
+ logger.info(f"loading LoRA: {args.model}")
36
+ network, weights_sd = lora.create_network_from_weights(1.0, args.model, vae, text_encoder, unet)
37
+
38
+ # check whether the LoRA has weights for the text encoder; ideally this should be done on the lora side
39
+ has_te_weight = False
40
+ for key in weights_sd.keys():
41
+ if 'lora_te' in key:
42
+ has_te_weight = True
43
+ break
44
+ if not has_te_weight:
45
+ logger.error("This LoRA does not have modules for Text Encoder, cannot interrogate / このLoRAはText Encoder向けのモジュールがないため調査できません")
46
+ return
47
+ del vae
48
+
49
+ logger.info("loading tokenizer")
50
+ if args.v2:
51
+ tokenizer: CLIPTokenizer = CLIPTokenizer.from_pretrained(V2_STABLE_DIFFUSION_PATH, subfolder="tokenizer")
52
+ else:
53
+ tokenizer: CLIPTokenizer = CLIPTokenizer.from_pretrained(TOKENIZER_PATH) # , model_max_length=max_token_length + 2)
54
+
55
+ text_encoder.to(DEVICE, dtype=weights_dtype)
56
+ text_encoder.eval()
57
+ unet.to(DEVICE, dtype=weights_dtype)
58
+ unet.eval() # not strictly necessary, since the U-Net is never called
59
+
60
+ # examine the tokens one by one
61
+ token_id_start = 0
62
+ token_id_end = max(tokenizer.all_special_ids)
63
+ logger.info(f"interrogate tokens are: {token_id_start} to {token_id_end}")
64
+
65
+ def get_all_embeddings(text_encoder):
66
+ embs = []
67
+ with torch.no_grad():
68
+ for token_id in tqdm(range(token_id_start, token_id_end + 1, args.batch_size)):
69
+ batch = []
70
+ for tid in range(token_id, min(token_id_end + 1, token_id + args.batch_size)):
71
+ tokens = [tokenizer.bos_token_id, tid, tokenizer.eos_token_id]
72
+ # tokens = [tid] # this variant gives somewhat poorer results
73
+ batch.append(tokens)
74
+
75
+ # batch_embs = text_encoder(torch.tensor(batch).to(DEVICE))[0].to("cpu") # including bos/eos seems to make the differences clearer [:, 1]
76
+ # support clip skip
77
+ batch = torch.tensor(batch).to(DEVICE)
78
+ if args.clip_skip is None:
79
+ encoder_hidden_states = text_encoder(batch)[0]
80
+ else:
81
+ enc_out = text_encoder(batch, output_hidden_states=True, return_dict=True)
82
+ encoder_hidden_states = enc_out['hidden_states'][-args.clip_skip]
83
+ encoder_hidden_states = text_encoder.text_model.final_layer_norm(encoder_hidden_states)
84
+ encoder_hidden_states = encoder_hidden_states.to("cpu")
85
+
86
+ embs.extend(encoder_hidden_states)
87
+ return torch.stack(embs)
88
+
89
+ logger.info("get original text encoder embeddings.")
90
+ orig_embs = get_all_embeddings(text_encoder)
91
+
92
+ network.apply_to(text_encoder, unet, True, len(network.unet_loras) > 0)
93
+ info = network.load_state_dict(weights_sd, strict=False)
94
+ logger.info(f"Loading LoRA weights: {info}")
95
+
96
+ network.to(DEVICE, dtype=weights_dtype)
97
+ network.eval()
98
+
99
+ del unet
100
+
101
+ logger.info("You can ignore warning messages start with '_IncompatibleKeys' (LoRA model does not have alpha because trained by older script) / '_IncompatibleKeys'の警告は無視して構いません(以前のスクリプトで学習されたLoRAモデルのためalphaの定義がありません)")
102
+ logger.info("get text encoder embeddings with lora.")
103
+ lora_embs = get_all_embeddings(text_encoder)
104
+
105
+ # compare: for now, simply use the mean absolute difference
106
+ logger.info("comparing...")
107
+ diffs = {}
108
+ for i, (orig_emb, lora_emb) in enumerate(zip(orig_embs, tqdm(lora_embs))):
109
+ diff = torch.mean(torch.abs(orig_emb - lora_emb))
110
+ # diff = torch.mean(torch.cosine_similarity(orig_emb, lora_emb, dim=1)) # does not detect differences well
111
+ diff = float(diff.detach().to('cpu').numpy())
112
+ diffs[token_id_start + i] = diff
113
+
114
+ diffs_sorted = sorted(diffs.items(), key=lambda x: -x[1])
115
+
116
+ # show the results
117
+ print("top 100:")
118
+ for i, (token, diff) in enumerate(diffs_sorted[:100]):
119
+ # if diff < 1e-6:
120
+ # break
121
+ string = tokenizer.convert_tokens_to_string(tokenizer.convert_ids_to_tokens([token]))
122
+ print(f"[{i:3d}]: {token:5d} {string:<20s}: {diff:.5f}")
123
+
124
+
125
+ def setup_parser() -> argparse.ArgumentParser:
126
+ parser = argparse.ArgumentParser()
127
+
128
+ parser.add_argument("--v2", action='store_true',
129
+ help='load Stable Diffusion v2.x model / Stable Diffusion 2.xのモデルを読み込む')
130
+ parser.add_argument("--sd_model", type=str, default=None,
131
+ help="Stable Diffusion model to load: ckpt or safetensors file / 読み込むSDのモデル、ckptまたはsafetensors")
132
+ parser.add_argument("--model", type=str, default=None,
133
+ help="LoRA model to interrogate: ckpt or safetensors file / 調査するLoRAモデル、ckptまたはsafetensors")
134
+ parser.add_argument("--batch_size", type=int, default=16,
135
+ help="batch size for processing with Text Encoder / Text Encoderで処理するときのバッチサイズ")
136
+ parser.add_argument("--clip_skip", type=int, default=None,
137
+ help="use output of nth layer from back of text encoder (n>=1) / text encoderの後ろからn番目の層の出力を用いる(nは1以上)")
138
+
139
+ return parser
140
+
141
+
142
+ if __name__ == '__main__':
143
+ parser = setup_parser()
144
+
145
+ args = parser.parse_args()
146
+ interrogate(args)
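
For reference, a minimal sketch of how the interrogator above could be driven from Python (the checkpoint and LoRA paths and the clip_skip value are hypothetical placeholders, not taken from the diff):

# Equivalent to: python lora_interrogator.py --sd_model <ckpt> --model <lora> --clip_skip 2
parser = setup_parser()
args = parser.parse_args([
    "--sd_model", "sd-v1-5.safetensors",  # hypothetical SD checkpoint path
    "--model", "my_lora.safetensors",     # hypothetical LoRA file path
    "--clip_skip", "2",
])
interrogate(args)
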
lpw_stable_diffusion.py ADDED
@@ -0,0 +1,1233 @@
1
+ # copy from https://github.com/huggingface/diffusers/blob/main/examples/community/lpw_stable_diffusion.py
2
+ # and modify to support SD2.x
3
+
4
+ import inspect
5
+ import re
6
+ from typing import Callable, List, Optional, Union
7
+
8
+ import numpy as np
9
+ import PIL.Image
10
+ import torch
11
+ from packaging import version
12
+ from transformers import CLIPFeatureExtractor, CLIPTextModel, CLIPTokenizer, CLIPVisionModelWithProjection
13
+
14
+ import diffusers
15
+ from diffusers import SchedulerMixin, StableDiffusionPipeline
16
+ from diffusers.models import AutoencoderKL, UNet2DConditionModel
17
+ from diffusers.pipelines.stable_diffusion import StableDiffusionPipelineOutput, StableDiffusionSafetyChecker
18
+ from diffusers.utils import logging
19
+
20
+ try:
21
+ from diffusers.utils import PIL_INTERPOLATION
22
+ except ImportError:
23
+ if version.parse(version.parse(PIL.__version__).base_version) >= version.parse("9.1.0"):
24
+ PIL_INTERPOLATION = {
25
+ "linear": PIL.Image.Resampling.BILINEAR,
26
+ "bilinear": PIL.Image.Resampling.BILINEAR,
27
+ "bicubic": PIL.Image.Resampling.BICUBIC,
28
+ "lanczos": PIL.Image.Resampling.LANCZOS,
29
+ "nearest": PIL.Image.Resampling.NEAREST,
30
+ }
31
+ else:
32
+ PIL_INTERPOLATION = {
33
+ "linear": PIL.Image.LINEAR,
34
+ "bilinear": PIL.Image.BILINEAR,
35
+ "bicubic": PIL.Image.BICUBIC,
36
+ "lanczos": PIL.Image.LANCZOS,
37
+ "nearest": PIL.Image.NEAREST,
38
+ }
39
+ # ------------------------------------------------------------------------------
40
+
41
+ logger = logging.get_logger(__name__) # pylint: disable=invalid-name
42
+
43
+ re_attention = re.compile(
44
+ r"""
45
+ \\\(|
46
+ \\\)|
47
+ \\\[|
48
+ \\]|
49
+ \\\\|
50
+ \\|
51
+ \(|
52
+ \[|
53
+ :([+-]?[.\d]+)\)|
54
+ \)|
55
+ ]|
56
+ [^\\()\[\]:]+|
57
+ :
58
+ """,
59
+ re.X,
60
+ )
61
+
62
+
63
+ def parse_prompt_attention(text):
64
+ """
65
+ Parses a string with attention tokens and returns a list of pairs: text and its associated weight.
66
+ Accepted tokens are:
67
+ (abc) - increases attention to abc by a multiplier of 1.1
68
+ (abc:3.12) - increases attention to abc by a multiplier of 3.12
69
+ [abc] - decreases attention to abc by a multiplier of 1.1
70
+ \( - literal character '('
71
+ \[ - literal character '['
72
+ \) - literal character ')'
73
+ \] - literal character ']'
74
+ \\ - literal character '\'
75
+ anything else - just text
76
+ >>> parse_prompt_attention('normal text')
77
+ [['normal text', 1.0]]
78
+ >>> parse_prompt_attention('an (important) word')
79
+ [['an ', 1.0], ['important', 1.1], [' word', 1.0]]
80
+ >>> parse_prompt_attention('(unbalanced')
81
+ [['unbalanced', 1.1]]
82
+ >>> parse_prompt_attention('\(literal\]')
83
+ [['(literal]', 1.0]]
84
+ >>> parse_prompt_attention('(unnecessary)(parens)')
85
+ [['unnecessaryparens', 1.1]]
86
+ >>> parse_prompt_attention('a (((house:1.3)) [on] a (hill:0.5), sun, (((sky))).')
87
+ [['a ', 1.0],
88
+ ['house', 1.5730000000000004],
89
+ [' ', 1.1],
90
+ ['on', 1.0],
91
+ [' a ', 1.1],
92
+ ['hill', 0.55],
93
+ [', sun, ', 1.1],
94
+ ['sky', 1.4641000000000006],
95
+ ['.', 1.1]]
96
+ """
97
+
98
+ res = []
99
+ round_brackets = []
100
+ square_brackets = []
101
+
102
+ round_bracket_multiplier = 1.1
103
+ square_bracket_multiplier = 1 / 1.1
104
+
105
+ def multiply_range(start_position, multiplier):
106
+ for p in range(start_position, len(res)):
107
+ res[p][1] *= multiplier
108
+
109
+ for m in re_attention.finditer(text):
110
+ text = m.group(0)
111
+ weight = m.group(1)
112
+
113
+ if text.startswith("\\"):
114
+ res.append([text[1:], 1.0])
115
+ elif text == "(":
116
+ round_brackets.append(len(res))
117
+ elif text == "[":
118
+ square_brackets.append(len(res))
119
+ elif weight is not None and len(round_brackets) > 0:
120
+ multiply_range(round_brackets.pop(), float(weight))
121
+ elif text == ")" and len(round_brackets) > 0:
122
+ multiply_range(round_brackets.pop(), round_bracket_multiplier)
123
+ elif text == "]" and len(square_brackets) > 0:
124
+ multiply_range(square_brackets.pop(), square_bracket_multiplier)
125
+ else:
126
+ res.append([text, 1.0])
127
+
128
+ for pos in round_brackets:
129
+ multiply_range(pos, round_bracket_multiplier)
130
+
131
+ for pos in square_brackets:
132
+ multiply_range(pos, square_bracket_multiplier)
133
+
134
+ if len(res) == 0:
135
+ res = [["", 1.0]]
136
+
137
+ # merge runs of identical weights
138
+ i = 0
139
+ while i + 1 < len(res):
140
+ if res[i][1] == res[i + 1][1]:
141
+ res[i][0] += res[i + 1][0]
142
+ res.pop(i + 1)
143
+ else:
144
+ i += 1
145
+
146
+ return res
147
+
148
+
149
+ def get_prompts_with_weights(pipe: StableDiffusionPipeline, prompt: List[str], max_length: int):
150
+ r"""
151
+ Tokenize a list of prompts and return their tokens along with the weight of each token.
152
+
153
+ No padding, starting or ending token is included.
154
+ """
155
+ tokens = []
156
+ weights = []
157
+ truncated = False
158
+ for text in prompt:
159
+ texts_and_weights = parse_prompt_attention(text)
160
+ text_token = []
161
+ text_weight = []
162
+ for word, weight in texts_and_weights:
163
+ # tokenize and discard the starting and the ending token
164
+ token = pipe.tokenizer(word).input_ids[1:-1]
165
+ text_token += token
166
+ # copy the weight by length of token
167
+ text_weight += [weight] * len(token)
168
+ # stop if the text is too long (longer than truncation limit)
169
+ if len(text_token) > max_length:
170
+ truncated = True
171
+ break
172
+ # truncate
173
+ if len(text_token) > max_length:
174
+ truncated = True
175
+ text_token = text_token[:max_length]
176
+ text_weight = text_weight[:max_length]
177
+ tokens.append(text_token)
178
+ weights.append(text_weight)
179
+ if truncated:
180
+ logger.warning("Prompt was truncated. Try to shorten the prompt or increase max_embeddings_multiples")
181
+ return tokens, weights
182
+
183
+
184
+ def pad_tokens_and_weights(tokens, weights, max_length, bos, eos, no_boseos_middle=True, chunk_length=77):
185
+ r"""
186
+ Pad the tokens (with starting and ending tokens) and weights (with 1.0) to max_length.
187
+ """
188
+ max_embeddings_multiples = (max_length - 2) // (chunk_length - 2)
189
+ weights_length = max_length if no_boseos_middle else max_embeddings_multiples * chunk_length
190
+ for i in range(len(tokens)):
191
+ tokens[i] = [bos] + tokens[i] + [eos] * (max_length - 1 - len(tokens[i]))
192
+ if no_boseos_middle:
193
+ weights[i] = [1.0] + weights[i] + [1.0] * (max_length - 1 - len(weights[i]))
194
+ else:
195
+ w = []
196
+ if len(weights[i]) == 0:
197
+ w = [1.0] * weights_length
198
+ else:
199
+ for j in range(max_embeddings_multiples):
200
+ w.append(1.0) # weight for starting token in this chunk
201
+ w += weights[i][j * (chunk_length - 2) : min(len(weights[i]), (j + 1) * (chunk_length - 2))]
202
+ w.append(1.0) # weight for ending token in this chunk
203
+ w += [1.0] * (weights_length - len(w))
204
+ weights[i] = w[:]
205
+
206
+ return tokens, weights
207
+
208
+
209
+ def get_unweighted_text_embeddings(
210
+ pipe: StableDiffusionPipeline,
211
+ text_input: torch.Tensor,
212
+ chunk_length: int,
213
+ clip_skip: int,
214
+ eos: int,
215
+ pad: int,
216
+ no_boseos_middle: Optional[bool] = True,
217
+ ):
218
+ """
219
+ When the length of tokens is a multiple of the capacity of the text encoder,
220
+ it should be split into chunks and sent to the text encoder individually.
221
+ """
222
+ max_embeddings_multiples = (text_input.shape[1] - 2) // (chunk_length - 2)
223
+ if max_embeddings_multiples > 1:
224
+ text_embeddings = []
225
+ for i in range(max_embeddings_multiples):
226
+ # extract the i-th chunk
227
+ text_input_chunk = text_input[:, i * (chunk_length - 2) : (i + 1) * (chunk_length - 2) + 2].clone()
228
+
229
+ # cover the head and the tail by the starting and the ending tokens
230
+ text_input_chunk[:, 0] = text_input[0, 0]
231
+ if pad == eos: # v1
232
+ text_input_chunk[:, -1] = text_input[0, -1]
233
+ else: # v2
234
+ for j in range(len(text_input_chunk)):
235
+ if text_input_chunk[j, -1] != eos and text_input_chunk[j, -1] != pad: # an ordinary token is at the end
236
+ text_input_chunk[j, -1] = eos
237
+ if text_input_chunk[j, 1] == pad: # only BOS, the rest is PAD
238
+ text_input_chunk[j, 1] = eos
239
+
240
+ if clip_skip is None or clip_skip == 1:
241
+ text_embedding = pipe.text_encoder(text_input_chunk)[0]
242
+ else:
243
+ enc_out = pipe.text_encoder(text_input_chunk, output_hidden_states=True, return_dict=True)
244
+ text_embedding = enc_out["hidden_states"][-clip_skip]
245
+ text_embedding = pipe.text_encoder.text_model.final_layer_norm(text_embedding)
246
+
247
+ if no_boseos_middle:
248
+ if i == 0:
249
+ # discard the ending token
250
+ text_embedding = text_embedding[:, :-1]
251
+ elif i == max_embeddings_multiples - 1:
252
+ # discard the starting token
253
+ text_embedding = text_embedding[:, 1:]
254
+ else:
255
+ # discard both starting and ending tokens
256
+ text_embedding = text_embedding[:, 1:-1]
257
+
258
+ text_embeddings.append(text_embedding)
259
+ text_embeddings = torch.concat(text_embeddings, axis=1)
260
+ else:
261
+ if clip_skip is None or clip_skip == 1:
262
+ text_embeddings = pipe.text_encoder(text_input)[0]
263
+ else:
264
+ enc_out = pipe.text_encoder(text_input, output_hidden_states=True, return_dict=True)
265
+ text_embeddings = enc_out["hidden_states"][-clip_skip]
266
+ text_embeddings = pipe.text_encoder.text_model.final_layer_norm(text_embeddings)
267
+ return text_embeddings
268
+
269
+
270
+ def get_weighted_text_embeddings(
271
+ pipe: StableDiffusionPipeline,
272
+ prompt: Union[str, List[str]],
273
+ uncond_prompt: Optional[Union[str, List[str]]] = None,
274
+ max_embeddings_multiples: Optional[int] = 3,
275
+ no_boseos_middle: Optional[bool] = False,
276
+ skip_parsing: Optional[bool] = False,
277
+ skip_weighting: Optional[bool] = False,
278
+ clip_skip=None,
279
+ ):
280
+ r"""
281
+ Prompts can be assigned with local weights using brackets. For example,
282
+ prompt 'A (very beautiful) masterpiece' highlights the words 'very beautiful',
283
+ and the embedding tokens corresponding to the words get multiplied by a constant, 1.1.
284
+
285
+ Also, to regularize the embedding, the weighted embedding is scaled to preserve the original mean.
286
+
287
+ Args:
288
+ pipe (`StableDiffusionPipeline`):
289
+ Pipe to provide access to the tokenizer and the text encoder.
290
+ prompt (`str` or `List[str]`):
291
+ The prompt or prompts to guide the image generation.
292
+ uncond_prompt (`str` or `List[str]`):
293
+ The unconditional prompt or prompts to guide the image generation. If an unconditional prompt
294
+ is provided, the embeddings of prompt and uncond_prompt are concatenated.
295
+ max_embeddings_multiples (`int`, *optional*, defaults to `3`):
296
+ The max multiple length of prompt embeddings compared to the max output length of text encoder.
297
+ no_boseos_middle (`bool`, *optional*, defaults to `False`):
298
+ If the length of the text tokens is a multiple of the text encoder's capacity, whether to keep the starting and
299
+ ending tokens in each chunk in the middle.
300
+ skip_parsing (`bool`, *optional*, defaults to `False`):
301
+ Skip the parsing of brackets.
302
+ skip_weighting (`bool`, *optional*, defaults to `False`):
303
+ Skip the weighting. When parsing is skipped, this is forced to True.
304
+ """
305
+ max_length = (pipe.tokenizer.model_max_length - 2) * max_embeddings_multiples + 2
306
+ if isinstance(prompt, str):
307
+ prompt = [prompt]
308
+
309
+ if not skip_parsing:
310
+ prompt_tokens, prompt_weights = get_prompts_with_weights(pipe, prompt, max_length - 2)
311
+ if uncond_prompt is not None:
312
+ if isinstance(uncond_prompt, str):
313
+ uncond_prompt = [uncond_prompt]
314
+ uncond_tokens, uncond_weights = get_prompts_with_weights(pipe, uncond_prompt, max_length - 2)
315
+ else:
316
+ prompt_tokens = [token[1:-1] for token in pipe.tokenizer(prompt, max_length=max_length, truncation=True).input_ids]
317
+ prompt_weights = [[1.0] * len(token) for token in prompt_tokens]
318
+ if uncond_prompt is not None:
319
+ if isinstance(uncond_prompt, str):
320
+ uncond_prompt = [uncond_prompt]
321
+ uncond_tokens = [
322
+ token[1:-1] for token in pipe.tokenizer(uncond_prompt, max_length=max_length, truncation=True).input_ids
323
+ ]
324
+ uncond_weights = [[1.0] * len(token) for token in uncond_tokens]
325
+
326
+ # round up the longest length of tokens to a multiple of (model_max_length - 2)
327
+ max_length = max([len(token) for token in prompt_tokens])
328
+ if uncond_prompt is not None:
329
+ max_length = max(max_length, max([len(token) for token in uncond_tokens]))
330
+
331
+ max_embeddings_multiples = min(
332
+ max_embeddings_multiples,
333
+ (max_length - 1) // (pipe.tokenizer.model_max_length - 2) + 1,
334
+ )
335
+ max_embeddings_multiples = max(1, max_embeddings_multiples)
336
+ max_length = (pipe.tokenizer.model_max_length - 2) * max_embeddings_multiples + 2
337
+
338
+ # pad the length of tokens and weights
339
+ bos = pipe.tokenizer.bos_token_id
340
+ eos = pipe.tokenizer.eos_token_id
341
+ pad = pipe.tokenizer.pad_token_id
342
+ prompt_tokens, prompt_weights = pad_tokens_and_weights(
343
+ prompt_tokens,
344
+ prompt_weights,
345
+ max_length,
346
+ bos,
347
+ eos,
348
+ no_boseos_middle=no_boseos_middle,
349
+ chunk_length=pipe.tokenizer.model_max_length,
350
+ )
351
+ prompt_tokens = torch.tensor(prompt_tokens, dtype=torch.long, device=pipe.device)
352
+ if uncond_prompt is not None:
353
+ uncond_tokens, uncond_weights = pad_tokens_and_weights(
354
+ uncond_tokens,
355
+ uncond_weights,
356
+ max_length,
357
+ bos,
358
+ eos,
359
+ no_boseos_middle=no_boseos_middle,
360
+ chunk_length=pipe.tokenizer.model_max_length,
361
+ )
362
+ uncond_tokens = torch.tensor(uncond_tokens, dtype=torch.long, device=pipe.device)
363
+
364
+ # get the embeddings
365
+ text_embeddings = get_unweighted_text_embeddings(
366
+ pipe,
367
+ prompt_tokens,
368
+ pipe.tokenizer.model_max_length,
369
+ clip_skip,
370
+ eos,
371
+ pad,
372
+ no_boseos_middle=no_boseos_middle,
373
+ )
374
+ prompt_weights = torch.tensor(prompt_weights, dtype=text_embeddings.dtype, device=pipe.device)
375
+ if uncond_prompt is not None:
376
+ uncond_embeddings = get_unweighted_text_embeddings(
377
+ pipe,
378
+ uncond_tokens,
379
+ pipe.tokenizer.model_max_length,
380
+ clip_skip,
381
+ eos,
382
+ pad,
383
+ no_boseos_middle=no_boseos_middle,
384
+ )
385
+ uncond_weights = torch.tensor(uncond_weights, dtype=uncond_embeddings.dtype, device=pipe.device)
386
+
387
+ # assign weights to the prompts and normalize in the sense of mean
388
+ # TODO: should we normalize by chunk or in a whole (current implementation)?
389
+ if (not skip_parsing) and (not skip_weighting):
390
+ previous_mean = text_embeddings.float().mean(axis=[-2, -1]).to(text_embeddings.dtype)
391
+ text_embeddings *= prompt_weights.unsqueeze(-1)
392
+ current_mean = text_embeddings.float().mean(axis=[-2, -1]).to(text_embeddings.dtype)
393
+ text_embeddings *= (previous_mean / current_mean).unsqueeze(-1).unsqueeze(-1)
394
+ if uncond_prompt is not None:
395
+ previous_mean = uncond_embeddings.float().mean(axis=[-2, -1]).to(uncond_embeddings.dtype)
396
+ uncond_embeddings *= uncond_weights.unsqueeze(-1)
397
+ current_mean = uncond_embeddings.float().mean(axis=[-2, -1]).to(uncond_embeddings.dtype)
398
+ uncond_embeddings *= (previous_mean / current_mean).unsqueeze(-1).unsqueeze(-1)
399
+
400
+ if uncond_prompt is not None:
401
+ return text_embeddings, uncond_embeddings
402
+ return text_embeddings, None
403
+
404
+
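
As a quick illustration of the mean-preserving rescale performed by get_weighted_text_embeddings above: after each token embedding is multiplied by its weight, the result is rescaled so the per-prompt mean matches the unweighted embeddings. A minimal sketch with toy tensors (the shapes and weight values are made up, not taken from the pipeline):

import torch

emb = torch.arange(12, dtype=torch.float32).reshape(1, 4, 3) + 1.0  # (batch, tokens, dim) toy embeddings
weights = torch.tensor([[1.0, 1.1, 1.0, 0.9]])                      # toy per-token weights

previous_mean = emb.float().mean(axis=[-2, -1])
weighted = emb * weights.unsqueeze(-1)
current_mean = weighted.float().mean(axis=[-2, -1])
weighted = weighted * (previous_mean / current_mean).unsqueeze(-1).unsqueeze(-1)

# The per-prompt mean is unchanged by the weighting.
assert torch.allclose(weighted.mean(axis=[-2, -1]), previous_mean, atol=1e-5)
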
405
+ def preprocess_image(image):
406
+ w, h = image.size
407
+ w, h = map(lambda x: x - x % 32, (w, h)) # resize to integer multiple of 32
408
+ image = image.resize((w, h), resample=PIL_INTERPOLATION["lanczos"])
409
+ image = np.array(image).astype(np.float32) / 255.0
410
+ image = image[None].transpose(0, 3, 1, 2)
411
+ image = torch.from_numpy(image)
412
+ return 2.0 * image - 1.0
413
+
414
+
415
+ def preprocess_mask(mask, scale_factor=8):
416
+ mask = mask.convert("L")
417
+ w, h = mask.size
418
+ w, h = map(lambda x: x - x % 32, (w, h)) # resize to integer multiple of 32
419
+ mask = mask.resize((w // scale_factor, h // scale_factor), resample=PIL_INTERPOLATION["nearest"])
420
+ mask = np.array(mask).astype(np.float32) / 255.0
421
+ mask = np.tile(mask, (4, 1, 1))
422
+ mask = mask[None].transpose(0, 1, 2, 3) # identity transpose; the batch dimension was already added by mask[None]
423
+ mask = 1 - mask # repaint white, keep black
424
+ mask = torch.from_numpy(mask)
425
+ return mask
426
+
427
+
428
+ def prepare_controlnet_image(
429
+ image: PIL.Image.Image,
430
+ width: int,
431
+ height: int,
432
+ batch_size: int,
433
+ num_images_per_prompt: int,
434
+ device: torch.device,
435
+ dtype: torch.dtype,
436
+ do_classifier_free_guidance: bool = False,
437
+ guess_mode: bool = False,
438
+ ):
439
+ if not isinstance(image, torch.Tensor):
440
+ if isinstance(image, PIL.Image.Image):
441
+ image = [image]
442
+
443
+ if isinstance(image[0], PIL.Image.Image):
444
+ images = []
445
+
446
+ for image_ in image:
447
+ image_ = image_.convert("RGB")
448
+ image_ = image_.resize((width, height), resample=PIL_INTERPOLATION["lanczos"])
449
+ image_ = np.array(image_)
450
+ image_ = image_[None, :]
451
+ images.append(image_)
452
+
453
+ image = images
454
+
455
+ image = np.concatenate(image, axis=0)
456
+ image = np.array(image).astype(np.float32) / 255.0
457
+ image = image.transpose(0, 3, 1, 2)
458
+ image = torch.from_numpy(image)
459
+ elif isinstance(image[0], torch.Tensor):
460
+ image = torch.cat(image, dim=0)
461
+
462
+ image_batch_size = image.shape[0]
463
+
464
+ if image_batch_size == 1:
465
+ repeat_by = batch_size
466
+ else:
467
+ # image batch size is the same as prompt batch size
468
+ repeat_by = num_images_per_prompt
469
+
470
+ image = image.repeat_interleave(repeat_by, dim=0)
471
+
472
+ image = image.to(device=device, dtype=dtype)
473
+
474
+ if do_classifier_free_guidance and not guess_mode:
475
+ image = torch.cat([image] * 2)
476
+
477
+ return image
478
+
479
+
480
+ class StableDiffusionLongPromptWeightingPipeline(StableDiffusionPipeline):
481
+ r"""
482
+ Pipeline for text-to-image generation using Stable Diffusion without a token length limit, with support for parsing
483
+ weights in the prompt.
484
+
485
+ This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the
486
+ library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.)
487
+
488
+ Args:
489
+ vae ([`AutoencoderKL`]):
490
+ Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations.
491
+ text_encoder ([`CLIPTextModel`]):
492
+ Frozen text-encoder. Stable Diffusion uses the text portion of
493
+ [CLIP](https://huggingface.co/docs/transformers/model_doc/clip#transformers.CLIPTextModel), specifically
494
+ the [clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14) variant.
495
+ tokenizer (`CLIPTokenizer`):
496
+ Tokenizer of class
497
+ [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer).
498
+ unet ([`UNet2DConditionModel`]): Conditional U-Net architecture to denoise the encoded image latents.
499
+ scheduler ([`SchedulerMixin`]):
500
+ A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of
501
+ [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`].
502
+ safety_checker ([`StableDiffusionSafetyChecker`]):
503
+ Classification module that estimates whether generated images could be considered offensive or harmful.
504
+ Please, refer to the [model card](https://huggingface.co/CompVis/stable-diffusion-v1-4) for details.
505
+ feature_extractor ([`CLIPFeatureExtractor`]):
506
+ Model that extracts features from generated images to be used as inputs for the `safety_checker`.
507
+ """
508
+
509
+ # if version.parse(version.parse(diffusers.__version__).base_version) >= version.parse("0.9.0"):
510
+
511
+ def __init__(
512
+ self,
513
+ vae: AutoencoderKL,
514
+ text_encoder: CLIPTextModel,
515
+ tokenizer: CLIPTokenizer,
516
+ unet: UNet2DConditionModel,
517
+ scheduler: SchedulerMixin,
518
+ # clip_skip: int,
519
+ safety_checker: StableDiffusionSafetyChecker,
520
+ feature_extractor: CLIPFeatureExtractor,
521
+ requires_safety_checker: bool = True,
522
+ image_encoder: CLIPVisionModelWithProjection = None,
523
+ clip_skip: int = 1,
524
+ ):
525
+ super().__init__(
526
+ vae=vae,
527
+ text_encoder=text_encoder,
528
+ tokenizer=tokenizer,
529
+ unet=unet,
530
+ scheduler=scheduler,
531
+ safety_checker=safety_checker,
532
+ feature_extractor=feature_extractor,
533
+ requires_safety_checker=requires_safety_checker,
534
+ image_encoder=image_encoder,
535
+ )
536
+ self.custom_clip_skip = clip_skip
537
+ self.__init__additional__()
538
+
539
+ def __init__additional__(self):
540
+ if not hasattr(self, "vae_scale_factor"):
541
+ setattr(self, "vae_scale_factor", 2 ** (len(self.vae.config.block_out_channels) - 1))
542
+
543
+ @property
544
+ def _execution_device(self):
545
+ r"""
546
+ Returns the device on which the pipeline's models will be executed. After calling
547
+ `pipeline.enable_sequential_cpu_offload()` the execution device can only be inferred from Accelerate's module
548
+ hooks.
549
+ """
550
+ if self.device != torch.device("meta") or not hasattr(self.unet, "_hf_hook"):
551
+ return self.device
552
+ for module in self.unet.modules():
553
+ if (
554
+ hasattr(module, "_hf_hook")
555
+ and hasattr(module._hf_hook, "execution_device")
556
+ and module._hf_hook.execution_device is not None
557
+ ):
558
+ return torch.device(module._hf_hook.execution_device)
559
+ return self.device
560
+
561
+ def _encode_prompt(
562
+ self,
563
+ prompt,
564
+ device,
565
+ num_images_per_prompt,
566
+ do_classifier_free_guidance,
567
+ negative_prompt,
568
+ max_embeddings_multiples,
569
+ ):
570
+ r"""
571
+ Encodes the prompt into text encoder hidden states.
572
+
573
+ Args:
574
+ prompt (`str` or `list(int)`):
575
+ prompt to be encoded
576
+ device: (`torch.device`):
577
+ torch device
578
+ num_images_per_prompt (`int`):
579
+ number of images that should be generated per prompt
580
+ do_classifier_free_guidance (`bool`):
581
+ whether to use classifier free guidance or not
582
+ negative_prompt (`str` or `List[str]`):
583
+ The prompt or prompts not to guide the image generation. Ignored when not using guidance (i.e., ignored
584
+ if `guidance_scale` is less than `1`).
585
+ max_embeddings_multiples (`int`, *optional*, defaults to `3`):
586
+ The max multiple length of prompt embeddings compared to the max output length of text encoder.
587
+ """
588
+ batch_size = len(prompt) if isinstance(prompt, list) else 1
589
+
590
+ if negative_prompt is None:
591
+ negative_prompt = [""] * batch_size
592
+ elif isinstance(negative_prompt, str):
593
+ negative_prompt = [negative_prompt] * batch_size
594
+ if batch_size != len(negative_prompt):
595
+ raise ValueError(
596
+ f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:"
597
+ f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches"
598
+ " the batch size of `prompt`."
599
+ )
600
+
601
+ text_embeddings, uncond_embeddings = get_weighted_text_embeddings(
602
+ pipe=self,
603
+ prompt=prompt,
604
+ uncond_prompt=negative_prompt if do_classifier_free_guidance else None,
605
+ max_embeddings_multiples=max_embeddings_multiples,
606
+ clip_skip=self.custom_clip_skip,
607
+ )
608
+ bs_embed, seq_len, _ = text_embeddings.shape
609
+ text_embeddings = text_embeddings.repeat(1, num_images_per_prompt, 1)
610
+ text_embeddings = text_embeddings.view(bs_embed * num_images_per_prompt, seq_len, -1)
611
+
612
+ if do_classifier_free_guidance:
613
+ bs_embed, seq_len, _ = uncond_embeddings.shape
614
+ uncond_embeddings = uncond_embeddings.repeat(1, num_images_per_prompt, 1)
615
+ uncond_embeddings = uncond_embeddings.view(bs_embed * num_images_per_prompt, seq_len, -1)
616
+ text_embeddings = torch.cat([uncond_embeddings, text_embeddings])
617
+
618
+ return text_embeddings
619
+
620
+ def check_inputs(self, prompt, height, width, strength, callback_steps):
621
+ if not isinstance(prompt, str) and not isinstance(prompt, list):
622
+ raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
623
+
624
+ if strength < 0 or strength > 1:
625
+ raise ValueError(f"The value of strength should in [0.0, 1.0] but is {strength}")
626
+
627
+ if height % 8 != 0 or width % 8 != 0:
628
+ logger.info(f'{height} {width}')
629
+ raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.")
630
+
631
+ if (callback_steps is None) or (
632
+ callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0)
633
+ ):
634
+ raise ValueError(
635
+ f"`callback_steps` has to be a positive integer but is {callback_steps} of type" f" {type(callback_steps)}."
636
+ )
637
+
638
+ def get_timesteps(self, num_inference_steps, strength, device, is_text2img):
639
+ if is_text2img:
640
+ return self.scheduler.timesteps.to(device), num_inference_steps
641
+ else:
642
+ # get the original timestep using init_timestep
643
+ offset = self.scheduler.config.get("steps_offset", 0)
644
+ init_timestep = int(num_inference_steps * strength) + offset
645
+ init_timestep = min(init_timestep, num_inference_steps)
646
+
647
+ t_start = max(num_inference_steps - init_timestep + offset, 0)
648
+ timesteps = self.scheduler.timesteps[t_start:].to(device)
649
+ return timesteps, num_inference_steps - t_start
650
+
651
+ def run_safety_checker(self, image, device, dtype):
652
+ if self.safety_checker is not None:
653
+ safety_checker_input = self.feature_extractor(self.numpy_to_pil(image), return_tensors="pt").to(device)
654
+ image, has_nsfw_concept = self.safety_checker(images=image, clip_input=safety_checker_input.pixel_values.to(dtype))
655
+ else:
656
+ has_nsfw_concept = None
657
+ return image, has_nsfw_concept
658
+
659
+ def decode_latents(self, latents):
660
+ latents = 1 / 0.18215 * latents
661
+ image = self.vae.decode(latents).sample
662
+ image = (image / 2 + 0.5).clamp(0, 1)
663
+ # we always cast to float32 as this does not cause significant overhead and is compatible with bfloat16
664
+ image = image.cpu().permute(0, 2, 3, 1).float().numpy()
665
+ return image
666
+
667
+ def prepare_extra_step_kwargs(self, generator, eta):
668
+ # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature
669
+ # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers.
670
+ # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502
671
+ # and should be between [0, 1]
672
+
673
+ accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys())
674
+ extra_step_kwargs = {}
675
+ if accepts_eta:
676
+ extra_step_kwargs["eta"] = eta
677
+
678
+ # check if the scheduler accepts generator
679
+ accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys())
680
+ if accepts_generator:
681
+ extra_step_kwargs["generator"] = generator
682
+ return extra_step_kwargs
683
+
684
+ def prepare_latents(self, image, timestep, batch_size, height, width, dtype, device, generator, latents=None):
685
+ if image is None:
686
+ shape = (
687
+ batch_size,
688
+ self.unet.in_channels,
689
+ height // self.vae_scale_factor,
690
+ width // self.vae_scale_factor,
691
+ )
692
+
693
+ if latents is None:
694
+ if device.type == "mps":
695
+ # randn does not work reproducibly on mps
696
+ latents = torch.randn(shape, generator=generator, device="cpu", dtype=dtype).to(device)
697
+ else:
698
+ latents = torch.randn(shape, generator=generator, device=device, dtype=dtype)
699
+ else:
700
+ if latents.shape != shape:
701
+ raise ValueError(f"Unexpected latents shape, got {latents.shape}, expected {shape}")
702
+ latents = latents.to(device)
703
+
704
+ # scale the initial noise by the standard deviation required by the scheduler
705
+ latents = latents * self.scheduler.init_noise_sigma
706
+ return latents, None, None
707
+ else:
708
+ init_latent_dist = self.vae.encode(image).latent_dist
709
+ init_latents = init_latent_dist.sample(generator=generator)
710
+ init_latents = 0.18215 * init_latents
711
+ init_latents = torch.cat([init_latents] * batch_size, dim=0)
712
+ init_latents_orig = init_latents
713
+ shape = init_latents.shape
714
+
715
+ # add noise to latents using the timesteps
716
+ if device.type == "mps":
717
+ noise = torch.randn(shape, generator=generator, device="cpu", dtype=dtype).to(device)
718
+ else:
719
+ noise = torch.randn(shape, generator=generator, device=device, dtype=dtype)
720
+ latents = self.scheduler.add_noise(init_latents, noise, timestep)
721
+ return latents, init_latents_orig, noise
722
+
723
+ @torch.no_grad()
724
+ def __call__(
725
+ self,
726
+ prompt: Union[str, List[str]],
727
+ negative_prompt: Optional[Union[str, List[str]]] = None,
728
+ image: Union[torch.FloatTensor, PIL.Image.Image] = None,
729
+ mask_image: Union[torch.FloatTensor, PIL.Image.Image] = None,
730
+ height: int = 512,
731
+ width: int = 512,
732
+ num_inference_steps: int = 50,
733
+ guidance_scale: float = 7.5,
734
+ strength: float = 0.8,
735
+ num_images_per_prompt: Optional[int] = 1,
736
+ eta: float = 0.0,
737
+ generator: Optional[torch.Generator] = None,
738
+ latents: Optional[torch.FloatTensor] = None,
739
+ max_embeddings_multiples: Optional[int] = 3,
740
+ output_type: Optional[str] = "pil",
741
+ return_dict: bool = True,
742
+ controlnet=None,
743
+ controlnet_image=None,
744
+ callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None,
745
+ is_cancelled_callback: Optional[Callable[[], bool]] = None,
746
+ callback_steps: int = 1,
747
+ ):
748
+ r"""
749
+ Function invoked when calling the pipeline for generation.
750
+
751
+ Args:
752
+ prompt (`str` or `List[str]`):
753
+ The prompt or prompts to guide the image generation.
754
+ negative_prompt (`str` or `List[str]`, *optional*):
755
+ The prompt or prompts not to guide the image generation. Ignored when not using guidance (i.e., ignored
756
+ if `guidance_scale` is less than `1`).
757
+ image (`torch.FloatTensor` or `PIL.Image.Image`):
758
+ `Image`, or tensor representing an image batch, that will be used as the starting point for the
759
+ process.
760
+ mask_image (`torch.FloatTensor` or `PIL.Image.Image`):
761
+ `Image`, or tensor representing an image batch, to mask `image`. White pixels in the mask will be
762
+ replaced by noise and therefore repainted, while black pixels will be preserved. If `mask_image` is a
763
+ PIL image, it will be converted to a single channel (luminance) before use. If it's a tensor, it should
764
+ contain one color channel (L) instead of 3, so the expected shape would be `(B, H, W, 1)`.
765
+ height (`int`, *optional*, defaults to 512):
766
+ The height in pixels of the generated image.
767
+ width (`int`, *optional*, defaults to 512):
768
+ The width in pixels of the generated image.
769
+ num_inference_steps (`int`, *optional*, defaults to 50):
770
+ The number of denoising steps. More denoising steps usually lead to a higher quality image at the
771
+ expense of slower inference.
772
+ guidance_scale (`float`, *optional*, defaults to 7.5):
773
+ Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
774
+ `guidance_scale` is defined as `w` of equation 2. of [Imagen
775
+ Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
776
+ 1`. Higher guidance scale encourages generating images that are closely linked to the text `prompt`,
777
+ usually at the expense of lower image quality.
778
+ strength (`float`, *optional*, defaults to 0.8):
779
+ Conceptually, indicates how much to transform the reference `image`. Must be between 0 and 1.
780
+ `image` will be used as a starting point, adding more noise to it the larger the `strength`. The
781
+ number of denoising steps depends on the amount of noise initially added. When `strength` is 1, added
782
+ noise will be maximum and the denoising process will run for the full number of iterations specified in
783
+ `num_inference_steps`. A value of 1, therefore, essentially ignores `image`.
784
+ num_images_per_prompt (`int`, *optional*, defaults to 1):
785
+ The number of images to generate per prompt.
786
+ eta (`float`, *optional*, defaults to 0.0):
787
+ Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to
788
+ [`schedulers.DDIMScheduler`], will be ignored for others.
789
+ generator (`torch.Generator`, *optional*):
790
+ A [torch generator](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation
791
+ deterministic.
792
+ latents (`torch.FloatTensor`, *optional*):
793
+ Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
794
+ generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
795
+ tensor will be generated by sampling using the supplied random `generator`.
796
+ max_embeddings_multiples (`int`, *optional*, defaults to `3`):
797
+ The max multiple length of prompt embeddings compared to the max output length of text encoder.
798
+ output_type (`str`, *optional*, defaults to `"pil"`):
799
+ The output format of the generate image. Choose between
800
+ [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
801
+ return_dict (`bool`, *optional*, defaults to `True`):
802
+ Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a
803
+ plain tuple.
804
+ controlnet (`diffusers.ControlNetModel`, *optional*):
805
+ A controlnet model to be used for the inference. If not provided, controlnet will be disabled.
806
+ controlnet_image (`torch.FloatTensor` or `PIL.Image.Image`, *optional*):
807
+ `Image`, or tensor representing an image batch, to be used as the starting point for the controlnet
808
+ inference.
809
+ callback (`Callable`, *optional*):
810
+ A function that will be called every `callback_steps` steps during inference. The function will be
811
+ called with the following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`.
812
+ is_cancelled_callback (`Callable`, *optional*):
813
+ A function that will be called every `callback_steps` steps during inference. If the function returns
814
+ `True`, the inference will be cancelled.
815
+ callback_steps (`int`, *optional*, defaults to 1):
816
+ The frequency at which the `callback` function will be called. If not specified, the callback will be
817
+ called at every step.
818
+
819
+ Returns:
820
+ `None` if cancelled by `is_cancelled_callback`,
821
+ [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] or `tuple`:
822
+ [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] if `return_dict` is True, otherwise a `tuple`.
823
+ When returning a tuple, the first element is a list with the generated images, and the second element is a
824
+ list of `bool`s denoting whether the corresponding generated image likely represents "not-safe-for-work"
825
+ (nsfw) content, according to the `safety_checker`.
826
+ """
827
+ if controlnet is not None and controlnet_image is None:
828
+ raise ValueError("controlnet_image must be provided if controlnet is not None.")
829
+
830
+ # 0. Default height and width to unet
831
+ height = height or self.unet.config.sample_size * self.vae_scale_factor
832
+ width = width or self.unet.config.sample_size * self.vae_scale_factor
833
+
834
+ # 1. Check inputs. Raise error if not correct
835
+ self.check_inputs(prompt, height, width, strength, callback_steps)
836
+
837
+ # 2. Define call parameters
838
+ batch_size = 1 if isinstance(prompt, str) else len(prompt)
839
+ device = self._execution_device
840
+ # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
841
+ # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
842
+ # corresponds to doing no classifier free guidance.
843
+ do_classifier_free_guidance = guidance_scale > 1.0
844
+
845
+ # 3. Encode input prompt
846
+ text_embeddings = self._encode_prompt(
847
+ prompt,
848
+ device,
849
+ num_images_per_prompt,
850
+ do_classifier_free_guidance,
851
+ negative_prompt,
852
+ max_embeddings_multiples,
853
+ )
854
+ dtype = text_embeddings.dtype
855
+
856
+ # 4. Preprocess image and mask
857
+ if isinstance(image, PIL.Image.Image):
858
+ image = preprocess_image(image)
859
+ if image is not None:
860
+ image = image.to(device=self.device, dtype=dtype)
861
+ if isinstance(mask_image, PIL.Image.Image):
862
+ mask_image = preprocess_mask(mask_image, self.vae_scale_factor)
863
+ if mask_image is not None:
864
+ mask = mask_image.to(device=self.device, dtype=dtype)
865
+ mask = torch.cat([mask] * batch_size * num_images_per_prompt)
866
+ else:
867
+ mask = None
868
+
869
+ if controlnet_image is not None:
870
+ controlnet_image = prepare_controlnet_image(
871
+ controlnet_image, width, height, batch_size, 1, self.device, controlnet.dtype, do_classifier_free_guidance, False
872
+ )
873
+
874
+ # 5. set timesteps
875
+ self.scheduler.set_timesteps(num_inference_steps, device=device)
876
+ timesteps, num_inference_steps = self.get_timesteps(num_inference_steps, strength, device, image is None)
877
+ latent_timestep = timesteps[:1].repeat(batch_size * num_images_per_prompt)
878
+
879
+ # 6. Prepare latent variables
880
+ latents, init_latents_orig, noise = self.prepare_latents(
881
+ image,
882
+ latent_timestep,
883
+ batch_size * num_images_per_prompt,
884
+ height,
885
+ width,
886
+ dtype,
887
+ device,
888
+ generator,
889
+ latents,
890
+ )
891
+
892
+ # 7. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
893
+ extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
894
+
895
+ # 8. Denoising loop
896
+ for i, t in enumerate(self.progress_bar(timesteps)):
897
+ # expand the latents if we are doing classifier free guidance
898
+ latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents
899
+ latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
900
+
901
+ unet_additional_args = {}
902
+ if controlnet is not None:
903
+ down_block_res_samples, mid_block_res_sample = controlnet(
904
+ latent_model_input,
905
+ t,
906
+ encoder_hidden_states=text_embeddings,
907
+ controlnet_cond=controlnet_image,
908
+ conditioning_scale=1.0,
909
+ guess_mode=False,
910
+ return_dict=False,
911
+ )
912
+ unet_additional_args["down_block_additional_residuals"] = down_block_res_samples
913
+ unet_additional_args["mid_block_additional_residual"] = mid_block_res_sample
914
+
915
+ # predict the noise residual
916
+ noise_pred = self.unet(latent_model_input, t, encoder_hidden_states=text_embeddings, **unet_additional_args).sample
917
+
918
+ # perform guidance
919
+ if do_classifier_free_guidance:
920
+ noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
921
+ noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
922
+
923
+ # compute the previous noisy sample x_t -> x_t-1
924
+ latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample
925
+
926
+ if mask is not None:
927
+ # masking
928
+ init_latents_proper = self.scheduler.add_noise(init_latents_orig, noise, torch.tensor([t]))
929
+ latents = (init_latents_proper * mask) + (latents * (1 - mask))
930
+
931
+ # call the callback, if provided
932
+ if i % callback_steps == 0:
933
+ if callback is not None:
934
+ callback(i, t, latents)
935
+ if is_cancelled_callback is not None and is_cancelled_callback():
936
+ return None
937
+
938
+ return latents
939
+
940
+ def latents_to_image(self, latents):
941
+ # 9. Post-processing
942
+ image = self.decode_latents(latents.to(self.vae.dtype))
943
+ image = self.numpy_to_pil(image)
944
+ return image
945
+
946
+ def text2img(
947
+ self,
948
+ prompt: Union[str, List[str]],
949
+ negative_prompt: Optional[Union[str, List[str]]] = None,
950
+ height: int = 512,
951
+ width: int = 512,
952
+ num_inference_steps: int = 50,
953
+ guidance_scale: float = 7.5,
954
+ num_images_per_prompt: Optional[int] = 1,
955
+ eta: float = 0.0,
956
+ generator: Optional[torch.Generator] = None,
957
+ latents: Optional[torch.FloatTensor] = None,
958
+ max_embeddings_multiples: Optional[int] = 3,
959
+ output_type: Optional[str] = "pil",
960
+ return_dict: bool = True,
961
+ callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None,
962
+ is_cancelled_callback: Optional[Callable[[], bool]] = None,
963
+ callback_steps: int = 1,
964
+ ):
965
+ r"""
966
+ Function for text-to-image generation.
967
+ Args:
968
+ prompt (`str` or `List[str]`):
969
+ The prompt or prompts to guide the image generation.
970
+ negative_prompt (`str` or `List[str]`, *optional*):
971
+ The prompt or prompts not to guide the image generation. Ignored when not using guidance (i.e., ignored
972
+ if `guidance_scale` is less than `1`).
973
+ height (`int`, *optional*, defaults to 512):
974
+ The height in pixels of the generated image.
975
+ width (`int`, *optional*, defaults to 512):
976
+ The width in pixels of the generated image.
977
+ num_inference_steps (`int`, *optional*, defaults to 50):
978
+ The number of denoising steps. More denoising steps usually lead to a higher quality image at the
979
+ expense of slower inference.
980
+ guidance_scale (`float`, *optional*, defaults to 7.5):
981
+ Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
982
+ `guidance_scale` is defined as `w` of equation 2. of [Imagen
983
+ Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
984
+ 1`. Higher guidance scale encourages generating images that are closely linked to the text `prompt`,
985
+ usually at the expense of lower image quality.
986
+ num_images_per_prompt (`int`, *optional*, defaults to 1):
987
+ The number of images to generate per prompt.
988
+ eta (`float`, *optional*, defaults to 0.0):
989
+ Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to
990
+ [`schedulers.DDIMScheduler`], will be ignored for others.
991
+ generator (`torch.Generator`, *optional*):
992
+ A [torch generator](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation
993
+ deterministic.
994
+ latents (`torch.FloatTensor`, *optional*):
995
+ Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
996
+ generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
997
+ tensor will be generated by sampling using the supplied random `generator`.
998
+ max_embeddings_multiples (`int`, *optional*, defaults to `3`):
999
+ The max multiple length of prompt embeddings compared to the max output length of text encoder.
1000
+ output_type (`str`, *optional*, defaults to `"pil"`):
1001
+ The output format of the generated image. Choose between
1002
+ [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
1003
+ return_dict (`bool`, *optional*, defaults to `True`):
1004
+ Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a
1005
+ plain tuple.
1006
+ callback (`Callable`, *optional*):
1007
+ A function that will be called every `callback_steps` steps during inference. The function will be
1008
+ called with the following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`.
1009
+ is_cancelled_callback (`Callable`, *optional*):
1010
+ A function that will be called every `callback_steps` steps during inference. If the function returns
1011
+ `True`, the inference will be cancelled.
1012
+ callback_steps (`int`, *optional*, defaults to 1):
1013
+ The frequency at which the `callback` function will be called. If not specified, the callback will be
1014
+ called at every step.
1015
+ Returns:
1016
+ [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] or `tuple`:
1017
+ [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] if `return_dict` is True, otherwise a `tuple`.
1018
+ When returning a tuple, the first element is a list with the generated images, and the second element is a
1019
+ list of `bool`s denoting whether the corresponding generated image likely represents "not-safe-for-work"
1020
+ (nsfw) content, according to the `safety_checker`.
1021
+ """
1022
+ return self.__call__(
1023
+ prompt=prompt,
1024
+ negative_prompt=negative_prompt,
1025
+ height=height,
1026
+ width=width,
1027
+ num_inference_steps=num_inference_steps,
1028
+ guidance_scale=guidance_scale,
1029
+ num_images_per_prompt=num_images_per_prompt,
1030
+ eta=eta,
1031
+ generator=generator,
1032
+ latents=latents,
1033
+ max_embeddings_multiples=max_embeddings_multiples,
1034
+ output_type=output_type,
1035
+ return_dict=return_dict,
1036
+ callback=callback,
1037
+ is_cancelled_callback=is_cancelled_callback,
1038
+ callback_steps=callback_steps,
1039
+ )
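For illustration only (not part of the upstream diff): assuming `pipe` is an already-constructed instance of the pipeline class that defines `text2img` above and is already on the GPU, a minimal call might look like the sketch below. The prompt, seed, and output path are placeholders.

```python
import torch

# Minimal sketch of calling the text2img wrapper defined above.
# `pipe` is an assumed pipeline instance; it is not created in this diff.
output = pipe.text2img(
    prompt="a watercolor painting of a lighthouse at dusk",
    negative_prompt="low quality, blurry",
    width=512,
    height=512,
    num_inference_steps=30,
    guidance_scale=7.5,
    generator=torch.Generator(device="cuda").manual_seed(1234),
)
image = output.images[0]  # StableDiffusionPipelineOutput when return_dict=True
image.save("text2img_sample.png")
```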
1040
+
1041
+ def img2img(
1042
+ self,
1043
+ image: Union[torch.FloatTensor, PIL.Image.Image],
1044
+ prompt: Union[str, List[str]],
1045
+ negative_prompt: Optional[Union[str, List[str]]] = None,
1046
+ strength: float = 0.8,
1047
+ num_inference_steps: Optional[int] = 50,
1048
+ guidance_scale: Optional[float] = 7.5,
1049
+ num_images_per_prompt: Optional[int] = 1,
1050
+ eta: Optional[float] = 0.0,
1051
+ generator: Optional[torch.Generator] = None,
1052
+ max_embeddings_multiples: Optional[int] = 3,
1053
+ output_type: Optional[str] = "pil",
1054
+ return_dict: bool = True,
1055
+ callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None,
1056
+ is_cancelled_callback: Optional[Callable[[], bool]] = None,
1057
+ callback_steps: int = 1,
1058
+ ):
1059
+ r"""
1060
+ Function for image-to-image generation.
1061
+ Args:
1062
+ image (`torch.FloatTensor` or `PIL.Image.Image`):
1063
+ `Image`, or tensor representing an image batch, that will be used as the starting point for the
1064
+ process.
1065
+ prompt (`str` or `List[str]`):
1066
+ The prompt or prompts to guide the image generation.
1067
+ negative_prompt (`str` or `List[str]`, *optional*):
1068
+ The prompt or prompts not to guide the image generation. Ignored when not using guidance (i.e., ignored
1069
+ if `guidance_scale` is less than `1`).
1070
+ strength (`float`, *optional*, defaults to 0.8):
1071
+ Conceptually, indicates how much to transform the reference `image`. Must be between 0 and 1.
1072
+ `image` will be used as a starting point, adding more noise to it the larger the `strength`. The
1073
+ number of denoising steps depends on the amount of noise initially added. When `strength` is 1, added
1074
+ noise will be maximum and the denoising process will run for the full number of iterations specified in
1075
+ `num_inference_steps`. A value of 1, therefore, essentially ignores `image`.
1076
+ num_inference_steps (`int`, *optional*, defaults to 50):
1077
+ The number of denoising steps. More denoising steps usually lead to a higher quality image at the
1078
+ expense of slower inference. This parameter will be modulated by `strength`.
1079
+ guidance_scale (`float`, *optional*, defaults to 7.5):
1080
+ Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
1081
+ `guidance_scale` is defined as `w` of equation 2. of [Imagen
1082
+ Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
1083
+ 1`. A higher guidance scale encourages generating images that are closely linked to the text `prompt`,
1084
+ usually at the expense of lower image quality.
1085
+ num_images_per_prompt (`int`, *optional*, defaults to 1):
1086
+ The number of images to generate per prompt.
1087
+ eta (`float`, *optional*, defaults to 0.0):
1088
+ Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to
1089
+ [`schedulers.DDIMScheduler`], will be ignored for others.
1090
+ generator (`torch.Generator`, *optional*):
1091
+ A [torch generator](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation
1092
+ deterministic.
1093
+ max_embeddings_multiples (`int`, *optional*, defaults to `3`):
1094
+ The max multiple length of prompt embeddings compared to the max output length of text encoder.
1095
+ output_type (`str`, *optional*, defaults to `"pil"`):
1096
+ The output format of the generated image. Choose between
1097
+ [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
1098
+ return_dict (`bool`, *optional*, defaults to `True`):
1099
+ Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a
1100
+ plain tuple.
1101
+ callback (`Callable`, *optional*):
1102
+ A function that will be called every `callback_steps` steps during inference. The function will be
1103
+ called with the following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`.
1104
+ is_cancelled_callback (`Callable`, *optional*):
1105
+ A function that will be called every `callback_steps` steps during inference. If the function returns
1106
+ `True`, the inference will be cancelled.
1107
+ callback_steps (`int`, *optional*, defaults to 1):
1108
+ The frequency at which the `callback` function will be called. If not specified, the callback will be
1109
+ called at every step.
1110
+ Returns:
1111
+ [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] or `tuple`:
1112
+ [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] if `return_dict` is True, otherwise a `tuple`.
1113
+ When returning a tuple, the first element is a list with the generated images, and the second element is a
1114
+ list of `bool`s denoting whether the corresponding generated image likely represents "not-safe-for-work"
1115
+ (nsfw) content, according to the `safety_checker`.
1116
+ """
1117
+ return self.__call__(
1118
+ prompt=prompt,
1119
+ negative_prompt=negative_prompt,
1120
+ image=image,
1121
+ num_inference_steps=num_inference_steps,
1122
+ guidance_scale=guidance_scale,
1123
+ strength=strength,
1124
+ num_images_per_prompt=num_images_per_prompt,
1125
+ eta=eta,
1126
+ generator=generator,
1127
+ max_embeddings_multiples=max_embeddings_multiples,
1128
+ output_type=output_type,
1129
+ return_dict=return_dict,
1130
+ callback=callback,
1131
+ is_cancelled_callback=is_cancelled_callback,
1132
+ callback_steps=callback_steps,
1133
+ )
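Similarly, a hedged sketch of the `img2img` wrapper, reusing the same assumed `pipe` instance; the file paths are placeholders.

```python
from PIL import Image

# Start from an existing picture and nudge it toward the prompt.
init_image = Image.open("input.png").convert("RGB")
output = pipe.img2img(
    image=init_image,
    prompt="the same scene in heavy snowfall",
    strength=0.6,             # lower values stay closer to the input image
    num_inference_steps=50,
    guidance_scale=7.5,
)
output.images[0].save("img2img_sample.png")
```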
1134
+
1135
+ def inpaint(
1136
+ self,
1137
+ image: Union[torch.FloatTensor, PIL.Image.Image],
1138
+ mask_image: Union[torch.FloatTensor, PIL.Image.Image],
1139
+ prompt: Union[str, List[str]],
1140
+ negative_prompt: Optional[Union[str, List[str]]] = None,
1141
+ strength: float = 0.8,
1142
+ num_inference_steps: Optional[int] = 50,
1143
+ guidance_scale: Optional[float] = 7.5,
1144
+ num_images_per_prompt: Optional[int] = 1,
1145
+ eta: Optional[float] = 0.0,
1146
+ generator: Optional[torch.Generator] = None,
1147
+ max_embeddings_multiples: Optional[int] = 3,
1148
+ output_type: Optional[str] = "pil",
1149
+ return_dict: bool = True,
1150
+ callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None,
1151
+ is_cancelled_callback: Optional[Callable[[], bool]] = None,
1152
+ callback_steps: int = 1,
1153
+ ):
1154
+ r"""
1155
+ Function for inpaint.
1156
+ Args:
1157
+ image (`torch.FloatTensor` or `PIL.Image.Image`):
1158
+ `Image`, or tensor representing an image batch, that will be used as the starting point for the
1159
+ process. This is the image whose masked region will be inpainted.
1160
+ mask_image (`torch.FloatTensor` or `PIL.Image.Image`):
1161
+ `Image`, or tensor representing an image batch, to mask `image`. White pixels in the mask will be
1162
+ replaced by noise and therefore repainted, while black pixels will be preserved. If `mask_image` is a
1163
+ PIL image, it will be converted to a single channel (luminance) before use. If it's a tensor, it should
1164
+ contain one color channel (L) instead of 3, so the expected shape would be `(B, H, W, 1)`.
1165
+ prompt (`str` or `List[str]`):
1166
+ The prompt or prompts to guide the image generation.
1167
+ negative_prompt (`str` or `List[str]`, *optional*):
1168
+ The prompt or prompts not to guide the image generation. Ignored when not using guidance (i.e., ignored
1169
+ if `guidance_scale` is less than `1`).
1170
+ strength (`float`, *optional*, defaults to 0.8):
1171
+ Conceptually, indicates how much to inpaint the masked area. Must be between 0 and 1. When `strength`
1172
+ is 1, the denoising process will be run on the masked area for the full number of iterations specified
1173
+ in `num_inference_steps`. `image` will be used as a reference for the masked area, adding more
1174
+ noise to that region the larger the `strength`. If `strength` is 0, no inpainting will occur.
1175
+ num_inference_steps (`int`, *optional*, defaults to 50):
1176
+ The reference number of denoising steps. More denoising steps usually lead to a higher quality image at
1177
+ the expense of slower inference. This parameter will be modulated by `strength`, as explained above.
1178
+ guidance_scale (`float`, *optional*, defaults to 7.5):
1179
+ Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
1180
+ `guidance_scale` is defined as `w` of equation 2. of [Imagen
1181
+ Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
1182
+ 1`. A higher guidance scale encourages generating images that are closely linked to the text `prompt`,
1183
+ usually at the expense of lower image quality.
1184
+ num_images_per_prompt (`int`, *optional*, defaults to 1):
1185
+ The number of images to generate per prompt.
1186
+ eta (`float`, *optional*, defaults to 0.0):
1187
+ Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to
1188
+ [`schedulers.DDIMScheduler`], will be ignored for others.
1189
+ generator (`torch.Generator`, *optional*):
1190
+ A [torch generator](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation
1191
+ deterministic.
1192
+ max_embeddings_multiples (`int`, *optional*, defaults to `3`):
1193
+ The max multiple length of prompt embeddings compared to the max output length of text encoder.
1194
+ output_type (`str`, *optional*, defaults to `"pil"`):
1195
+ The output format of the generated image. Choose between
1196
+ [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
1197
+ return_dict (`bool`, *optional*, defaults to `True`):
1198
+ Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a
1199
+ plain tuple.
1200
+ callback (`Callable`, *optional*):
1201
+ A function that will be called every `callback_steps` steps during inference. The function will be
1202
+ called with the following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`.
1203
+ is_cancelled_callback (`Callable`, *optional*):
1204
+ A function that will be called every `callback_steps` steps during inference. If the function returns
1205
+ `True`, the inference will be cancelled.
1206
+ callback_steps (`int`, *optional*, defaults to 1):
1207
+ The frequency at which the `callback` function will be called. If not specified, the callback will be
1208
+ called at every step.
1209
+ Returns:
1210
+ [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] or `tuple`:
1211
+ [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] if `return_dict` is True, otherwise a `tuple`.
1212
+ When returning a tuple, the first element is a list with the generated images, and the second element is a
1213
+ list of `bool`s denoting whether the corresponding generated image likely represents "not-safe-for-work"
1214
+ (nsfw) content, according to the `safety_checker`.
1215
+ """
1216
+ return self.__call__(
1217
+ prompt=prompt,
1218
+ negative_prompt=negative_prompt,
1219
+ image=image,
1220
+ mask_image=mask_image,
1221
+ num_inference_steps=num_inference_steps,
1222
+ guidance_scale=guidance_scale,
1223
+ strength=strength,
1224
+ num_images_per_prompt=num_images_per_prompt,
1225
+ eta=eta,
1226
+ generator=generator,
1227
+ max_embeddings_multiples=max_embeddings_multiples,
1228
+ output_type=output_type,
1229
+ return_dict=return_dict,
1230
+ callback=callback,
1231
+ is_cancelled_callback=is_cancelled_callback,
1232
+ callback_steps=callback_steps,
1233
+ )
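And a sketch of the `inpaint` wrapper with the same assumed `pipe`; the mask follows the semantics documented above (white pixels are repainted, black pixels are kept), and the file paths are placeholders.

```python
from PIL import Image

image = Image.open("photo.png").convert("RGB")
mask = Image.open("mask.png")   # converted to single-channel luminance internally
output = pipe.inpaint(
    image=image,
    mask_image=mask,
    prompt="a wooden park bench",
    strength=0.75,
    num_inference_steps=50,
)
output.images[0].save("inpaint_sample.png")
```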
main.py ADDED
@@ -0,0 +1,166 @@
1
+ """
2
+ extract factors the build is dependent on:
3
+ [X] compute capability
4
+ [ ] TODO: Q - What if we have multiple GPUs of different makes?
5
+ - CUDA version
6
+ - Software:
7
+ - CPU-only: only CPU quantization functions (no optimizer, no matrix multiplication)
8
+ - CuBLAS-LT: full-build 8-bit optimizer
9
+ - no CuBLAS-LT: no 8-bit matrix multiplication (`nomatmul`)
10
+
11
+ evaluation:
12
+ - if paths faulty, return meaningful error
13
+ - else:
14
+ - determine CUDA version
15
+ - determine capabilities
16
+ - based on that set the default path
17
+ """
18
+
19
+ import ctypes
20
+
21
+ from .paths import determine_cuda_runtime_lib_path
22
+
23
+
24
+ def check_cuda_result(cuda, result_val):
25
+ # 3. Check for CUDA errors
26
+ if result_val != 0:
27
+ error_str = ctypes.c_char_p()
28
+ cuda.cuGetErrorString(result_val, ctypes.byref(error_str))
29
+ print(f"CUDA exception! Error code: {error_str.value.decode()}")
30
+
31
+ def get_cuda_version(cuda, cudart_path):
32
+ # https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART____VERSION.html#group__CUDART____VERSION
33
+ try:
34
+ cudart = ctypes.CDLL(cudart_path)
35
+ except OSError:
36
+ # TODO: shouldn't we error or at least warn here?
37
+ print(f'ERROR: libcudart.so could not be read from path: {cudart_path}!')
38
+ return None
39
+
40
+ version = ctypes.c_int()
41
+ check_cuda_result(cuda, cudart.cudaRuntimeGetVersion(ctypes.byref(version)))
42
+ version = int(version.value)
43
+ major = version//1000
44
+ minor = (version-(major*1000))//10
45
+
46
+ if major < 11:
47
+ print('CUDA SETUP: CUDA versions lower than 11 are currently not supported for LLM.int8(). You will only be able to use 8-bit optimizers and quantization routines!')
48
+
49
+ return f'{major}{minor}'
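As a sanity check on the arithmetic above: `cudaRuntimeGetVersion` packs the version as `major * 1000 + minor * 10`, so CUDA 11.8 is reported as 11080. A worked example (values are illustrative):

```python
# Worked example of the version decoding above (11.8 -> "118").
version = 11080
major = version // 1000                    # 11
minor = (version - (major * 1000)) // 10   # 8
assert f"{major}{minor}" == "118"
```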
50
+
51
+
52
+ def get_cuda_lib_handle():
53
+ # 1. find libcuda.so library (GPU driver) (/usr/lib)
54
+ try:
55
+ cuda = ctypes.CDLL("libcuda.so")
56
+ except OSError:
57
+ # TODO: shouldn't we error or at least warn here?
58
+ print('CUDA SETUP: WARNING! libcuda.so not found! Do you have a CUDA driver installed? If you are on a cluster, make sure you are on a CUDA machine!')
59
+ return None
60
+ check_cuda_result(cuda, cuda.cuInit(0))
61
+
62
+ return cuda
63
+
64
+
65
+ def get_compute_capabilities(cuda):
66
+ """
67
+ 1. find libcuda.so library (GPU driver) (/usr/lib)
68
+ init_device -> init variables -> call function by reference
69
+ 2. call extern C function to determine CC
70
+ (https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__DEVICE__DEPRECATED.html)
71
+ 3. Check for CUDA errors
72
+ https://stackoverflow.com/questions/14038589/what-is-the-canonical-way-to-check-for-errors-using-the-cuda-runtime-api
73
+ # bits taken from https://gist.github.com/f0k/63a664160d016a491b2cbea15913d549
74
+ """
75
+
76
+
77
+ nGpus = ctypes.c_int()
78
+ cc_major = ctypes.c_int()
79
+ cc_minor = ctypes.c_int()
80
+
81
+ device = ctypes.c_int()
82
+
83
+ check_cuda_result(cuda, cuda.cuDeviceGetCount(ctypes.byref(nGpus)))
84
+ ccs = []
85
+ for i in range(nGpus.value):
86
+ check_cuda_result(cuda, cuda.cuDeviceGet(ctypes.byref(device), i))
87
+ ref_major = ctypes.byref(cc_major)
88
+ ref_minor = ctypes.byref(cc_minor)
89
+ # 2. call extern C function to determine CC
90
+ check_cuda_result(
91
+ cuda, cuda.cuDeviceComputeCapability(ref_major, ref_minor, device)
92
+ )
93
+ ccs.append(f"{cc_major.value}.{cc_minor.value}")
94
+
95
+ return ccs
96
+
97
+
98
+ # def get_compute_capability()-> Union[List[str, ...], None]: # FIXME: error
99
+ def get_compute_capability(cuda):
100
+ """
101
+ Extracts the highest compute capability from all available GPUs, as compute
102
+ capabilities are downwards compatible. If no GPUs are detected, it returns
103
+ None.
104
+ """
105
+ ccs = get_compute_capabilities(cuda)
106
+ if ccs: # an empty list means no GPUs were detected
107
+ # TODO: handle different compute capabilities properly; for now, take the last one reported
108
+ return ccs[-1]
109
+ return None
110
+
111
+
112
+ def evaluate_cuda_setup():
113
+ print('')
114
+ print('='*35 + 'BUG REPORT' + '='*35)
115
+ print('Welcome to bitsandbytes. For bug reports, please submit your error trace to: https://github.com/TimDettmers/bitsandbytes/issues')
116
+ print('For effortless bug reporting copy-paste your error into this form: https://docs.google.com/forms/d/e/1FAIpQLScPB8emS3Thkp66nvqwmjTEgxp8Y9ufuWTzFyr9kJ5AoI47dQ/viewform?usp=sf_link')
117
+ print('='*80)
118
+ return "libbitsandbytes_cuda116.dll" # $$$
119
+
120
+ binary_name = "libbitsandbytes_cpu.so"
121
+ #if not torch.cuda.is_available():
122
+ #print('No GPU detected. Loading CPU library...')
123
+ #return binary_name
124
+
125
+ cudart_path = determine_cuda_runtime_lib_path()
126
+ if cudart_path is None:
127
+ print(
128
+ "WARNING: No libcudart.so found! Install CUDA or the cudatoolkit package (anaconda)!"
129
+ )
130
+ return binary_name
131
+
132
+ print(f"CUDA SETUP: CUDA runtime path found: {cudart_path}")
133
+ cuda = get_cuda_lib_handle()
134
+ cc = get_compute_capability(cuda)
135
+ print(f"CUDA SETUP: Highest compute capability among GPUs detected: {cc}")
136
+ cuda_version_string = get_cuda_version(cuda, cudart_path)
137
+
138
+
139
+ if cc is None or cc == '': # no GPU detected
140
+ print(
141
+ "WARNING: No GPU detected! Check your CUDA paths. Processing to load CPU-only library..."
142
+ )
143
+ return binary_name
144
+
145
+ # 7.5 is the minimum CC for cuBLASLt
146
+ has_cublaslt = cc in ["7.5", "8.0", "8.6"]
147
+
148
+ # TODO:
149
+ # (1) CUDA missing cases (no CUDA installed, but CUDA driver (nvidia-smi) accessible)
150
+ # (2) Multiple CUDA versions installed
151
+
152
+ # we use ls -l instead of nvcc to determine the cuda version
153
+ # since most installations will have the libcudart.so installed, but not the compiler
154
+ print(f'CUDA SETUP: Detected CUDA version {cuda_version_string}')
155
+
156
+ def get_binary_name():
157
+ "if not has_cublaslt (CC < 7.5), then we have to choose _nocublaslt.so"
158
+ bin_base_name = "libbitsandbytes_cuda"
159
+ if has_cublaslt:
160
+ return f"{bin_base_name}{cuda_version_string}.so"
161
+ else:
162
+ return f"{bin_base_name}{cuda_version_string}_nocublaslt.so"
163
+
164
+ binary_name = get_binary_name()
165
+
166
+ return binary_name
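To make the selection logic concrete, here is an illustrative trace of `get_binary_name` for one plausible configuration. The values are examples rather than detected output, and on this Windows port the early `return` above short-circuits all of it.

```python
# Illustrative: a GPU with compute capability 8.6 and CUDA runtime 11.8 detected.
cc = "8.6"
cuda_version_string = "118"
has_cublaslt = cc in ["7.5", "8.0", "8.6"]          # True
suffix = "" if has_cublaslt else "_nocublaslt"
assert f"libbitsandbytes_cuda{cuda_version_string}{suffix}.so" == "libbitsandbytes_cuda118.so"
```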
make_captions.py ADDED
@@ -0,0 +1,210 @@
1
+ import argparse
2
+ import glob
3
+ import os
4
+ import json
5
+ import random
6
+ import sys
7
+
8
+ from pathlib import Path
9
+ from PIL import Image
10
+ from tqdm import tqdm
11
+ import numpy as np
12
+
13
+ import torch
14
+ from library.device_utils import init_ipex, get_preferred_device
15
+ init_ipex()
16
+
17
+ from torchvision import transforms
18
+ from torchvision.transforms.functional import InterpolationMode
19
+ sys.path.append(os.path.dirname(__file__))
20
+ from blip.blip import blip_decoder, is_url
21
+ import library.train_util as train_util
22
+ from library.utils import setup_logging
23
+ setup_logging()
24
+ import logging
25
+ logger = logging.getLogger(__name__)
26
+
27
+ DEVICE = get_preferred_device()
28
+
29
+
30
+ IMAGE_SIZE = 384
31
+
32
+ # A square resize seems questionable, but the original source does it this way, so we follow it
33
+ IMAGE_TRANSFORM = transforms.Compose(
34
+ [
35
+ transforms.Resize((IMAGE_SIZE, IMAGE_SIZE), interpolation=InterpolationMode.BICUBIC),
36
+ transforms.ToTensor(),
37
+ transforms.Normalize((0.48145466, 0.4578275, 0.40821073), (0.26862954, 0.26130258, 0.27577711)),
38
+ ]
39
+ )
40
+
41
+
42
+ # Would be nice to share this with the other scripts, but the processing differs slightly...
43
+ class ImageLoadingTransformDataset(torch.utils.data.Dataset):
44
+ def __init__(self, image_paths):
45
+ self.images = image_paths
46
+
47
+ def __len__(self):
48
+ return len(self.images)
49
+
50
+ def __getitem__(self, idx):
51
+ img_path = self.images[idx]
52
+
53
+ try:
54
+ image = Image.open(img_path).convert("RGB")
55
+ # convert to tensor temporarily so dataloader will accept it
56
+ tensor = IMAGE_TRANSFORM(image)
57
+ except Exception as e:
58
+ logger.error(f"Could not load image path / 画像を読み込めません: {img_path}, error: {e}")
59
+ return None
60
+
61
+ return (tensor, img_path)
62
+
63
+
64
+ def collate_fn_remove_corrupted(batch):
65
+ """Collate function that allows to remove corrupted examples in the
66
+ dataloader. It expects that the dataloader returns 'None' when that occurs.
67
+ The 'None's in the batch are removed.
68
+ """
69
+ # Filter out all the Nones (corrupted examples)
70
+ batch = list(filter(lambda x: x is not None, batch))
71
+ return batch
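A tiny illustration of the collate behaviour described in the docstring (assuming the function above is importable; the tuple contents are stand-ins for the real `(tensor, path)` pairs):

```python
# None entries (corrupted images) are dropped; valid (tensor, path) pairs survive.
batch = [("tensor_a", "a.png"), None, ("tensor_b", "b.png")]
assert collate_fn_remove_corrupted(batch) == [("tensor_a", "a.png"), ("tensor_b", "b.png")]
```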
72
+
73
+
74
+ def main(args):
75
+ # fix the seed for reproducibility
76
+ seed = args.seed # + utils.get_rank()
77
+ torch.manual_seed(seed)
78
+ np.random.seed(seed)
79
+ random.seed(seed)
80
+
81
+ if not os.path.exists("blip"):
82
+ args.train_data_dir = os.path.abspath(args.train_data_dir) # convert to absolute path
83
+
84
+ cwd = os.getcwd()
85
+ logger.info(f"Current Working Directory is: {cwd}")
86
+ os.chdir("finetune")
87
+ if not is_url(args.caption_weights) and not os.path.isfile(args.caption_weights):
88
+ args.caption_weights = os.path.join("..", args.caption_weights)
89
+
90
+ logger.info(f"load images from {args.train_data_dir}")
91
+ train_data_dir_path = Path(args.train_data_dir)
92
+ image_paths = train_util.glob_images_pathlib(train_data_dir_path, args.recursive)
93
+ logger.info(f"found {len(image_paths)} images.")
94
+
95
+ logger.info(f"loading BLIP caption: {args.caption_weights}")
96
+ model = blip_decoder(pretrained=args.caption_weights, image_size=IMAGE_SIZE, vit="large", med_config="./blip/med_config.json")
97
+ model.eval()
98
+ model = model.to(DEVICE)
99
+ logger.info("BLIP loaded")
100
+
101
+ # run captioning
102
+ def run_batch(path_imgs):
103
+ imgs = torch.stack([im for _, im in path_imgs]).to(DEVICE)
104
+
105
+ with torch.no_grad():
106
+ if args.beam_search:
107
+ captions = model.generate(
108
+ imgs, sample=False, num_beams=args.num_beams, max_length=args.max_length, min_length=args.min_length
109
+ )
110
+ else:
111
+ captions = model.generate(
112
+ imgs, sample=True, top_p=args.top_p, max_length=args.max_length, min_length=args.min_length
113
+ )
114
+
115
+ for (image_path, _), caption in zip(path_imgs, captions):
116
+ with open(os.path.splitext(image_path)[0] + args.caption_extension, "wt", encoding="utf-8") as f:
117
+ f.write(caption + "\n")
118
+ if args.debug:
119
+ logger.info(f'{image_path} {caption}')
120
+
121
+ # option to use a DataLoader to speed up image loading
122
+ if args.max_data_loader_n_workers is not None:
123
+ dataset = ImageLoadingTransformDataset(image_paths)
124
+ data = torch.utils.data.DataLoader(
125
+ dataset,
126
+ batch_size=args.batch_size,
127
+ shuffle=False,
128
+ num_workers=args.max_data_loader_n_workers,
129
+ collate_fn=collate_fn_remove_corrupted,
130
+ drop_last=False,
131
+ )
132
+ else:
133
+ data = [[(None, ip)] for ip in image_paths]
134
+
135
+ b_imgs = []
136
+ for data_entry in tqdm(data, smoothing=0.0):
137
+ for data in data_entry:
138
+ if data is None:
139
+ continue
140
+
141
+ img_tensor, image_path = data
142
+ if img_tensor is None:
143
+ try:
144
+ raw_image = Image.open(image_path)
145
+ if raw_image.mode != "RGB":
146
+ raw_image = raw_image.convert("RGB")
147
+ img_tensor = IMAGE_TRANSFORM(raw_image)
148
+ except Exception as e:
149
+ logger.error(f"Could not load image path / 画像を読み込めません: {image_path}, error: {e}")
150
+ continue
151
+
152
+ b_imgs.append((image_path, img_tensor))
153
+ if len(b_imgs) >= args.batch_size:
154
+ run_batch(b_imgs)
155
+ b_imgs.clear()
156
+ if len(b_imgs) > 0:
157
+ run_batch(b_imgs)
158
+
159
+ logger.info("done!")
160
+
161
+
162
+ def setup_parser() -> argparse.ArgumentParser:
163
+ parser = argparse.ArgumentParser()
164
+ parser.add_argument("train_data_dir", type=str, help="directory for train images / 学習画像データのディレクトリ")
165
+ parser.add_argument(
166
+ "--caption_weights",
167
+ type=str,
168
+ default="https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_large_caption.pth",
169
+ help="BLIP caption weights (model_large_caption.pth) / BLIP captionの重みファイル(model_large_caption.pth)",
170
+ )
171
+ parser.add_argument(
172
+ "--caption_extention",
173
+ type=str,
174
+ default=None,
175
+ help="extension of caption file (for backward compatibility) / 出力されるキャプションファイルの拡張子(スペルミスしていたのを残してあります)",
176
+ )
177
+ parser.add_argument("--caption_extension", type=str, default=".caption", help="extension of caption file / 出力されるキャプションファイルの拡張子")
178
+ parser.add_argument(
179
+ "--beam_search",
180
+ action="store_true",
181
+ help="use beam search (default Nucleus sampling) / beam searchを使う(このオプション未指定時はNucleus sampling)",
182
+ )
183
+ parser.add_argument("--batch_size", type=int, default=1, help="batch size in inference / 推論時のバッチサイズ")
184
+ parser.add_argument(
185
+ "--max_data_loader_n_workers",
186
+ type=int,
187
+ default=None,
188
+ help="enable image reading by DataLoader with this number of workers (faster) / DataLoaderによる画像読み込みを有効にしてこのワーカー数を適用する(読み込みを高速化)",
189
+ )
190
+ parser.add_argument("--num_beams", type=int, default=1, help="num of beams in beam search /beam search時のビーム数(多いと精度が上がるが時間がかかる)")
191
+ parser.add_argument("--top_p", type=float, default=0.9, help="top_p in Nucleus sampling / Nucleus sampling時のtop_p")
192
+ parser.add_argument("--max_length", type=int, default=75, help="max length of caption / captionの最大長")
193
+ parser.add_argument("--min_length", type=int, default=5, help="min length of caption / captionの最小長")
194
+ parser.add_argument("--seed", default=42, type=int, help="seed for reproducibility / 再現性を確保するための乱数seed")
195
+ parser.add_argument("--debug", action="store_true", help="debug mode")
196
+ parser.add_argument("--recursive", action="store_true", help="search for images in subfolders recursively / サブフォルダを再帰的に検索する")
197
+
198
+ return parser
199
+
200
+
201
+ if __name__ == "__main__":
202
+ parser = setup_parser()
203
+
204
+ args = parser.parse_args()
205
+
206
+ # map the previously misspelled option back to the correct one
207
+ if args.caption_extention is not None:
208
+ args.caption_extension = args.caption_extention
209
+
210
+ main(args)
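A hedged sketch of driving the script programmatically rather than from the command line; the image directory is a placeholder and the options mirror the parser defined above.

```python
# Build args exactly as the __main__ block does, then run captioning.
parser = setup_parser()
args = parser.parse_args(
    ["/path/to/train_images", "--batch_size", "8", "--beam_search", "--caption_extension", ".caption"]
)
if args.caption_extention is not None:   # keep the backward-compatibility shim
    args.caption_extension = args.caption_extention
main(args)
```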
make_captions_by_git.py ADDED
@@ -0,0 +1,183 @@
1
+ import argparse
2
+ import os
3
+ import re
4
+
5
+ from pathlib import Path
6
+ from PIL import Image
7
+ from tqdm import tqdm
8
+
9
+ import torch
10
+ from library.device_utils import init_ipex, get_preferred_device
11
+ init_ipex()
12
+
13
+ from transformers import AutoProcessor, AutoModelForCausalLM
14
+ from transformers.generation.utils import GenerationMixin
15
+
16
+ import library.train_util as train_util
17
+ from library.utils import setup_logging
18
+ setup_logging()
19
+ import logging
20
+ logger = logging.getLogger(__name__)
21
+
22
+ DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
23
+
24
+ PATTERN_REPLACE = [
25
+ re.compile(r'(has|with|and) the (words?|letters?|name) (" ?[^"]*"|\w+)( ?(is )?(on|in) (the |her |their |him )?\w+)?'),
26
+ re.compile(r'(with a sign )?that says ?(" ?[^"]*"|\w+)( ?on it)?'),
27
+ re.compile(r"(with a sign )?that says ?(' ?(i'm)?[^']*'|\w+)( ?on it)?"),
28
+ re.compile(r"with the number \d+ on (it|\w+ \w+)"),
29
+ re.compile(r'with the words "'),
30
+ re.compile(r"word \w+ on it"),
31
+ re.compile(r"that says the word \w+ on it"),
32
+ re.compile("that says'the word \"( on it)?"),
33
+ ]
34
+
35
+ # remove "with the word xxxx" phrases, which are mostly false detections
36
+
37
+
38
+ def remove_words(captions, debug):
39
+ removed_caps = []
40
+ for caption in captions:
41
+ cap = caption
42
+ for pat in PATTERN_REPLACE:
43
+ cap = pat.sub("", cap)
44
+ if debug and cap != caption:
45
+ logger.info(caption)
46
+ logger.info(cap)
47
+ removed_caps.append(cap)
48
+ return removed_caps
49
+
50
+
51
+ def collate_fn_remove_corrupted(batch):
52
+ """Collate function that allows to remove corrupted examples in the
53
+ dataloader. It expects that the dataloader returns 'None' when that occurs.
54
+ The 'None's in the batch are removed.
55
+ """
56
+ # Filter out all the Nones (corrupted examples)
57
+ batch = list(filter(lambda x: x is not None, batch))
58
+ return batch
59
+
60
+
61
+ def main(args):
62
+ r"""
63
+ The following is commented out because batch sizes > 1 work as of transformers 4.30.2
64
+
65
+ # Patch GIT so it also works with batch sizes greater than 1: for transformers 4.26.0
66
+ org_prepare_input_ids_for_generation = GenerationMixin._prepare_input_ids_for_generation
67
+ curr_batch_size = [args.batch_size] # use a list so it can be swapped when the last batch is smaller than batch_size
68
+
69
+ # input_ids must have the same number of rows as the batch size; the batch size is not visible from this function, so it is passed in from outside
70
+ # trying to replace this any higher up would be a huge amount of work
71
+ def _prepare_input_ids_for_generation_patch(self, bos_token_id, encoder_outputs):
72
+ input_ids = org_prepare_input_ids_for_generation(self, bos_token_id, encoder_outputs)
73
+ if input_ids.size()[0] != curr_batch_size[0]:
74
+ input_ids = input_ids.repeat(curr_batch_size[0], 1)
75
+ return input_ids
76
+
77
+ GenerationMixin._prepare_input_ids_for_generation = _prepare_input_ids_for_generation_patch
78
+ """
79
+
80
+ logger.info(f"load images from {args.train_data_dir}")
81
+ train_data_dir_path = Path(args.train_data_dir)
82
+ image_paths = train_util.glob_images_pathlib(train_data_dir_path, args.recursive)
83
+ logger.info(f"found {len(image_paths)} images.")
84
+
85
+ # ideally the model would be downloaded explicitly rather than relying on the cache
86
+ logger.info(f"loading GIT: {args.model_id}")
87
+ git_processor = AutoProcessor.from_pretrained(args.model_id)
88
+ git_model = AutoModelForCausalLM.from_pretrained(args.model_id).to(DEVICE)
89
+ logger.info("GIT loaded")
90
+
91
+ # run captioning
92
+ def run_batch(path_imgs):
93
+ imgs = [im for _, im in path_imgs]
94
+
95
+ # curr_batch_size[0] = len(path_imgs)
96
+ inputs = git_processor(images=imgs, return_tensors="pt").to(DEVICE) # images are in PIL format
97
+ generated_ids = git_model.generate(pixel_values=inputs.pixel_values, max_length=args.max_length)
98
+ captions = git_processor.batch_decode(generated_ids, skip_special_tokens=True)
99
+
100
+ if args.remove_words:
101
+ captions = remove_words(captions, args.debug)
102
+
103
+ for (image_path, _), caption in zip(path_imgs, captions):
104
+ with open(os.path.splitext(image_path)[0] + args.caption_extension, "wt", encoding="utf-8") as f:
105
+ f.write(caption + "\n")
106
+ if args.debug:
107
+ logger.info(f"{image_path} {caption}")
108
+
109
+ # option to use a DataLoader to speed up image loading
110
+ if args.max_data_loader_n_workers is not None:
111
+ dataset = train_util.ImageLoadingDataset(image_paths)
112
+ data = torch.utils.data.DataLoader(
113
+ dataset,
114
+ batch_size=args.batch_size,
115
+ shuffle=False,
116
+ num_workers=args.max_data_loader_n_workers,
117
+ collate_fn=collate_fn_remove_corrupted,
118
+ drop_last=False,
119
+ )
120
+ else:
121
+ data = [[(None, ip)] for ip in image_paths]
122
+
123
+ b_imgs = []
124
+ for data_entry in tqdm(data, smoothing=0.0):
125
+ for data in data_entry:
126
+ if data is None:
127
+ continue
128
+
129
+ image, image_path = data
130
+ if image is None:
131
+ try:
132
+ image = Image.open(image_path)
133
+ if image.mode != "RGB":
134
+ image = image.convert("RGB")
135
+ except Exception as e:
136
+ logger.error(f"Could not load image path / 画像を読み込めません: {image_path}, error: {e}")
137
+ continue
138
+
139
+ b_imgs.append((image_path, image))
140
+ if len(b_imgs) >= args.batch_size:
141
+ run_batch(b_imgs)
142
+ b_imgs.clear()
143
+
144
+ if len(b_imgs) > 0:
145
+ run_batch(b_imgs)
146
+
147
+ logger.info("done!")
148
+
149
+
150
+ def setup_parser() -> argparse.ArgumentParser:
151
+ parser = argparse.ArgumentParser()
152
+ parser.add_argument("train_data_dir", type=str, help="directory for train images / 学習画像データのディレクトリ")
153
+ parser.add_argument("--caption_extension", type=str, default=".caption", help="extension of caption file / 出力されるキャプションファイルの拡張子")
154
+ parser.add_argument(
155
+ "--model_id",
156
+ type=str,
157
+ default="microsoft/git-large-textcaps",
158
+ help="model id for GIT in Hugging Face / 使用するGITのHugging FaceのモデルID",
159
+ )
160
+ parser.add_argument("--batch_size", type=int, default=1, help="batch size in inference / 推論時のバッチサイズ")
161
+ parser.add_argument(
162
+ "--max_data_loader_n_workers",
163
+ type=int,
164
+ default=None,
165
+ help="enable image reading by DataLoader with this number of workers (faster) / DataLoaderによる画像読み込みを有効にしてこのワーカー数を適用する(読み込みを高速化)",
166
+ )
167
+ parser.add_argument("--max_length", type=int, default=50, help="max length of caption / captionの最大長")
168
+ parser.add_argument(
169
+ "--remove_words",
170
+ action="store_true",
171
+ help="remove like `with the words xxx` from caption / `with the words xxx`のような部分をキャプションから削除する",
172
+ )
173
+ parser.add_argument("--debug", action="store_true", help="debug mode")
174
+ parser.add_argument("--recursive", action="store_true", help="search for images in subfolders recursively / サブフォルダを再帰的に検索する")
175
+
176
+ return parser
177
+
178
+
179
+ if __name__ == "__main__":
180
+ parser = setup_parser()
181
+
182
+ args = parser.parse_args()
183
+ main(args)
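Analogously, a sketch for the GIT-based script; the path is a placeholder and the options are those defined in `setup_parser` above.

```python
# Programmatic invocation of the GIT captioning script.
parser = setup_parser()
args = parser.parse_args(["/path/to/train_images", "--batch_size", "4", "--remove_words"])
main(args)
```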
masked_loss_README-ja.md ADDED
@@ -0,0 +1,57 @@
1
+ ## マスクロスについて
2
+
3
+ マスクロスは、入力画像のマスクで指定された部分だけ損失計算することで、画像の一部分だけを学習することができる機能です。
4
+ たとえばキャラクタを学習したい場合、キャラクタ部分だけをマスクして学習することで、背景を無視して学習することができます。
5
+
6
+ マスクロスのマスクには、二種類の指定方法があります。
7
+
8
+ - マスク画像を用いる方法
9
+ - 透明度(アルファチャネル)を使用する方法
10
+
11
+ なお、サンプルは [ずんずんPJイラスト/3Dデータ](https://zunko.jp/con_illust.html) の「AI画像モデル用学習データ」を使用しています。
12
+
13
+ ### マスク画像を用いる方法
14
+
15
+ 学習画像それぞれに対応するマスク画像を用意する方法です。学習画像と同じファイル名のマスク画像を用意し、それを学習画像と別のディレクトリに保存します。
16
+
17
+ - 学習画像
18
+ ![image](https://github.com/kohya-ss/sd-scripts/assets/52813779/607c5116-5f62-47de-8b66-9c4a597f0441)
19
+ - マスク画像
20
+ ![image](https://github.com/kohya-ss/sd-scripts/assets/52813779/53e9b0f8-a4bf-49ed-882d-4026f84e8450)
21
+
22
+ ```.toml
23
+ [[datasets.subsets]]
24
+ image_dir = "/path/to/a_zundamon"
25
+ caption_extension = ".txt"
26
+ conditioning_data_dir = "/path/to/a_zundamon_mask"
27
+ num_repeats = 8
28
+ ```
29
+
30
+ マスク画像は、学習画像と同じサイズで、学習する部分を白、無視する部分を黒で描画します。グレースケールにも対応しています(127 ならロス重みが 0.5 になります)。なお、正確にはマスク画像の R チャネルが用いられます。
31
+
32
+ DreamBooth 方式の dataset で、`conditioning_data_dir` で指定したディレクトリにマスク画像を保存してください。ControlNet のデータセットと同じですので、詳細は [ControlNet-LLLite](train_lllite_README-ja.md#データセットの準備) を参照してください。
33
+
34
+ ### 透明度(アルファチャネル)を使用する方法
35
+
36
+ 学習画像の透明度(アルファチャネル)がマスクとして使用されます。透明度が 0 の部分は無視され、255 の部分は学習されます。半透明の場合は、その透明度に応じてロス重みが変化します(127 ならおおむね 0.5)。
37
+
38
+ ![image](https://github.com/kohya-ss/sd-scripts/assets/52813779/0baa129b-446a-4aac-b98c-7208efb0e75e)
39
+
40
+ ※それぞれの画像は透過PNG
41
+
42
+ 学習時のスクリプトのオプションに `--alpha_mask` を指定するか、dataset の設定ファイルの subset で、`alpha_mask` を指定してください。たとえば、以下のようになります。
43
+
44
+ ```toml
45
+ [[datasets.subsets]]
46
+ image_dir = "/path/to/image/dir"
47
+ caption_extension = ".txt"
48
+ num_repeats = 8
49
+ alpha_mask = true
50
+ ```
51
+
52
+ ## 学習時の注意事項
53
+
54
+ - 現時点では DreamBooth 方式の dataset のみ対応しています。
55
+ - マスクは latents のサイズ、つまり 1/8 に縮小されてから適用されます。そのため、細かい部分(たとえばアホ毛やイヤリングなど)はうまく学習できない可能性があります。マスクをわずかに拡張するなどの工夫が必要かもしれません。
56
+ - マスクロスを用いる場合、学習対象外の部分をキャプションに含める必要はないかもしれません。(要検証)
57
+ - `alpha_mask` の場合、マスクの有無を切り替えると latents キャッシュが自動的に再生成されます。
masked_loss_README.md ADDED
@@ -0,0 +1,56 @@
1
+ ## Masked Loss
2
+
3
+ Masked loss is a feature that allows you to train only part of an image by calculating the loss only for the part specified by the mask of the input image. For example, if you want to train a character, you can train only the character part by masking it, ignoring the background.
4
+
5
+ There are two ways to specify the mask for masked loss.
6
+
7
+ - Using a mask image
8
+ - Using transparency (alpha channel) of the image
9
+
10
+ The sample uses the "AI image model training data" from [ZunZunPJ Illustration/3D Data](https://zunko.jp/con_illust.html).
11
+
12
+ ### Using a mask image
13
+
14
+ This is a method of preparing a mask image corresponding to each training image. Prepare a mask image with the same file name as the training image and save it in a different directory from the training image.
15
+
16
+ - Training image
17
+ ![image](https://github.com/kohya-ss/sd-scripts/assets/52813779/607c5116-5f62-47de-8b66-9c4a597f0441)
18
+ - Mask image
19
+ ![image](https://github.com/kohya-ss/sd-scripts/assets/52813779/53e9b0f8-a4bf-49ed-882d-4026f84e8450)
20
+
21
+ ```toml
22
+ [[datasets.subsets]]
23
+ image_dir = "/path/to/a_zundamon"
24
+ caption_extension = ".txt"
25
+ conditioning_data_dir = "/path/to/a_zundamon_mask"
26
+ num_repeats = 8
27
+ ```
28
+
29
+ The mask image is the same size as the training image, with the part to be trained drawn in white and the part to be ignored in black. Grayscale is also supported (a value of 127 gives a loss weight of 0.5). Note that, strictly speaking, the R channel of the mask image is what is currently used.
30
+
31
+ Use the dataset in the DreamBooth method, and save the mask image in the directory specified by `conditioning_data_dir`. It is the same as the ControlNet dataset, so please refer to [ControlNet-LLLite](train_lllite_README.md#Preparing-the-dataset) for details.
32
+
33
+ ### Using transparency (alpha channel) of the image
34
+
35
+ The transparency (alpha channel) of the training image is used as a mask. The part with transparency 0 is ignored, the part with transparency 255 is trained. For semi-transparent parts, the loss weight changes according to the transparency (127 gives a weight of about 0.5).
36
+
37
+ ![image](https://github.com/kohya-ss/sd-scripts/assets/52813779/0baa129b-446a-4aac-b98c-7208efb0e75e)
38
+
39
+ ※Each image is a transparent PNG
40
+
41
+ Specify `--alpha_mask` in the training script options or specify `alpha_mask` in the subset of the dataset configuration file. For example, it will look like this.
42
+
43
+ ```toml
44
+ [[datasets.subsets]]
45
+ image_dir = "/path/to/image/dir"
46
+ caption_extension = ".txt"
47
+ num_repeats = 8
48
+ alpha_mask = true
49
+ ```
50
+
51
+ ## Notes on training
52
+
53
+ - At the moment, only the dataset in the DreamBooth method is supported.
54
+ - The mask is applied after being reduced to 1/8 of the image size, i.e. the size of the latents. Therefore, fine details (such as ahoge or earrings) may not be learned well. Slightly dilating the mask may be necessary.
55
+ - If using masked loss, it may not be necessary to include parts that are not to be trained in the caption. (To be verified)
56
+ - In the case of `alpha_mask`, the latents cache is automatically regenerated when the enable/disable state of the mask is switched.
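For intuition, the weighting described above can be sketched conceptually as follows. This is not the repository's actual implementation; the function name and tensor shapes are illustrative only, and the mask is assumed to be already normalized to [0, 1] (1 = train, 0 = ignore).

```python
import torch
import torch.nn.functional as F

def masked_mse_loss(model_pred: torch.Tensor, target: torch.Tensor, mask: torch.Tensor) -> torch.Tensor:
    # model_pred/target: (B, C, h, w) latent-space tensors; mask: (B, 1, H, W) image-space weights.
    # Downscale the mask to the latent resolution (1/8 of the image), as noted above.
    mask = F.interpolate(mask, size=model_pred.shape[-2:], mode="area")
    per_element = F.mse_loss(model_pred, target, reduction="none") * mask
    # Normalize by the weighted number of contributing elements so the loss scale
    # does not depend on how much of the image is masked out.
    denom = mask.sum() * model_pred.shape[1]
    return per_element.sum() / denom.clamp(min=1e-6)
```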