import json
import logging
import numbers

import torch

from modules.Device import Device
from modules.cond import cast
from modules.clip.CLIPTextModel import CLIPTextModel


def gen_empty_tokens(special_tokens: dict, length: int) -> list:
    """#### Generate a list of empty tokens.
    #### Args:
        - `special_tokens` (dict): The special tokens.
        - `length` (int): The length of the token list.
    #### Returns:
        - `list`: The list of empty tokens.
    """
    start_token = special_tokens.get("start", None)
    end_token = special_tokens.get("end", None)
    pad_token = special_tokens.get("pad")
    output = []
    if start_token is not None:
        output.append(start_token)
    if end_token is not None:
        output.append(end_token)
    output += [pad_token] * (length - len(output))
    return output
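
# Illustrative example: with CLIP's default special tokens (the same defaults
# used by SDClipModel below), an "empty" prompt of length 5 is a start token,
# an end token, then padding:
#   gen_empty_tokens({"start": 49406, "end": 49407, "pad": 49407}, 5)
#   -> [49406, 49407, 49407, 49407, 49407]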


class ClipTokenWeightEncoder:
    """#### Class representing a CLIP token weight encoder."""

    def encode_token_weights(self, token_weight_pairs: list) -> tuple:
        """#### Encode token weights.
        #### Args:
            - `token_weight_pairs` (list): The token weight pairs.
        #### Returns:
            - `tuple`: The encoded tokens and the pooled output.
        """
        to_encode = []
        max_token_len = 0
        has_weights = False
        for x in token_weight_pairs:
            tokens = [a[0] for a in x]
            max_token_len = max(len(tokens), max_token_len)
            has_weights = has_weights or not all(a[1] == 1.0 for a in x)
            to_encode.append(tokens)

        sections = len(to_encode)
        if has_weights or sections == 0:
            # Also encode an "empty" prompt so weighted tokens can be
            # interpolated against it below.
            to_encode.append(gen_empty_tokens(self.special_tokens, max_token_len))

        o = self.encode(to_encode)
        out, pooled = o[:2]

        if pooled is not None:
            first_pooled = pooled[0:1].to(Device.intermediate_device())
        else:
            first_pooled = pooled

        output = []
        for k in range(0, sections):
            z = out[k : k + 1]
            if has_weights:
                z_empty = out[-1]
                for i in range(len(z)):
                    for j in range(len(z[i])):
                        weight = token_weight_pairs[k][j][1]
                        if weight != 1.0:
                            z[i][j] = (z[i][j] - z_empty[j]) * weight + z_empty[j]
            output.append(z)

        if len(output) == 0:
            r = (out[-1:].to(Device.intermediate_device()), first_pooled)
        else:
            r = (
                torch.cat(output, dim=-2).to(Device.intermediate_device()),
                first_pooled,
            )

        if len(o) > 2:
            extra = {}
            for k in o[2]:
                v = o[2][k]
                if k == "attention_mask":
                    v = (
                        v[:sections]
                        .flatten()
                        .unsqueeze(dim=0)
                        .to(Device.intermediate_device())
                    )
                extra[k] = v
            r = r + (extra,)
        return r
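
# The weighting loop above interpolates each token's embedding against the
# empty-prompt embedding: z' = z_empty + weight * (z - z_empty). A weight of
# 1.0 leaves the embedding unchanged, 0.0 collapses it to the empty prompt,
# and weights above 1.0 push it further away from the empty prompt.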


class SDClipModel(torch.nn.Module, ClipTokenWeightEncoder):
    """#### Uses the CLIP transformer encoder for text (from huggingface)."""

    LAYERS = ["last", "pooled", "hidden"]

    def __init__(
        self,
        version: str = "openai/clip-vit-large-patch14",
        device: str = "cpu",
        max_length: int = 77,
        freeze: bool = True,
        layer: str = "last",
        layer_idx: int = None,
        textmodel_json_config: str = None,
        dtype: torch.dtype = None,
        model_class: type = CLIPTextModel,
        special_tokens: dict = {"start": 49406, "end": 49407, "pad": 49407},
        layer_norm_hidden_state: bool = True,
        enable_attention_masks: bool = False,
        zero_out_masked: bool = False,
        return_projected_pooled: bool = True,
        return_attention_masks: bool = False,
        model_options={},
    ):
        """#### Initialize the SDClipModel.
        #### Args:
            - `version` (str, optional): The version of the model. Defaults to "openai/clip-vit-large-patch14".
            - `device` (str, optional): The device to use. Defaults to "cpu".
            - `max_length` (int, optional): The maximum length of the input. Defaults to 77.
            - `freeze` (bool, optional): Whether to freeze the model parameters. Defaults to True.
            - `layer` (str, optional): The layer to use. Defaults to "last".
            - `layer_idx` (int, optional): The index of the layer. Defaults to None.
            - `textmodel_json_config` (str, optional): The path to the JSON config file. Defaults to None.
            - `dtype` (torch.dtype, optional): The data type. Defaults to None.
            - `model_class` (type, optional): The model class. Defaults to CLIPTextModel.
            - `special_tokens` (dict, optional): The special tokens. Defaults to {"start": 49406, "end": 49407, "pad": 49407}.
            - `layer_norm_hidden_state` (bool, optional): Whether to normalize the hidden state. Defaults to True.
            - `enable_attention_masks` (bool, optional): Whether to enable attention masks. Defaults to False.
            - `zero_out_masked` (bool, optional): Whether to zero out masked tokens. Defaults to False.
            - `return_projected_pooled` (bool, optional): Whether to return the projected pooled output. Defaults to True.
            - `return_attention_masks` (bool, optional): Whether to return the attention masks. Defaults to False.
            - `model_options` (dict, optional): Additional model options. Defaults to {}.
        """
        super().__init__()
        assert layer in self.LAYERS
        if textmodel_json_config is None:
            textmodel_json_config = "./_internal/clip/sd1_clip_config.json"
        with open(textmodel_json_config) as f:
            config = json.load(f)

        operations = model_options.get("custom_operations", None)
        if operations is None:
            operations = cast.manual_cast
        self.operations = operations
        self.transformer = model_class(config, dtype, device, self.operations)
        self.num_layers = self.transformer.num_layers

        self.max_length = max_length
        if freeze:
            self.freeze()
        self.layer = layer
        self.layer_idx = None
        self.special_tokens = special_tokens

        self.logit_scale = torch.nn.Parameter(torch.tensor(4.6055))
        self.enable_attention_masks = enable_attention_masks
        self.zero_out_masked = zero_out_masked
        self.layer_norm_hidden_state = layer_norm_hidden_state
        self.return_projected_pooled = return_projected_pooled
        self.return_attention_masks = return_attention_masks

        if layer == "hidden":
            assert layer_idx is not None
            assert abs(layer_idx) < self.num_layers
            self.set_clip_options({"layer": layer_idx})
        self.options_default = (
            self.layer,
            self.layer_idx,
            self.return_projected_pooled,
        )

    def freeze(self) -> None:
        """#### Freeze the model parameters."""
        self.transformer = self.transformer.eval()
        for param in self.parameters():
            param.requires_grad = False

    def set_clip_options(self, options: dict) -> None:
        """#### Set the CLIP options.
        #### Args:
            - `options` (dict): The options to set.
        """
        layer_idx = options.get("layer", self.layer_idx)
        self.return_projected_pooled = options.get(
            "projected_pooled", self.return_projected_pooled
        )
        if layer_idx is None or abs(layer_idx) > self.num_layers:
            self.layer = "last"
        else:
            self.layer = "hidden"
            self.layer_idx = layer_idx

    def reset_clip_options(self) -> None:
        """#### Reset the CLIP options to default."""
        self.layer = self.options_default[0]
        self.layer_idx = self.options_default[1]
        self.return_projected_pooled = self.options_default[2]
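
    # Example (illustrative): select the penultimate hidden state, commonly
    # called "clip skip" in SD tooling, then restore the defaults afterwards:
    #   model.set_clip_options({"layer": -2})
    #   ...
    #   model.reset_clip_options()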

    def set_up_textual_embeddings(self, tokens: list, current_embeds: torch.nn.Embedding) -> list:
        """#### Set up the textual embeddings.
        #### Args:
            - `tokens` (list): The input tokens.
            - `current_embeds` (torch.nn.Embedding): The current embeddings.
        #### Returns:
            - `list`: The processed tokens.
        """
        out_tokens = []
        next_new_token = token_dict_size = current_embeds.weight.shape[0]
        embedding_weights = []
        for x in tokens:
            tokens_temp = []
            for y in x:
                if isinstance(y, numbers.Integral):
                    tokens_temp += [int(y)]
                else:
                    # Non-integer entries are custom embedding vectors (e.g.
                    # textual inversion); give each a fresh token id appended
                    # to the end of the embedding matrix.
                    if y.shape[0] == current_embeds.weight.shape[1]:
                        embedding_weights += [y]
                        tokens_temp += [next_new_token]
                        next_new_token += 1
                    else:
                        logging.warning(
                            "WARNING: shape mismatch when trying to apply embedding, embedding will be ignored {} != {}".format(
                                y.shape[0], current_embeds.weight.shape[1]
                            )
                        )
            while len(tokens_temp) < len(x):
                tokens_temp += [self.special_tokens["pad"]]
            out_tokens += [tokens_temp]
        n = token_dict_size
        if len(embedding_weights) > 0:
            new_embedding = self.operations.Embedding(
                next_new_token + 1,
                current_embeds.weight.shape[1],
                device=current_embeds.weight.device,
                dtype=current_embeds.weight.dtype,
            )
            new_embedding.weight[:token_dict_size] = current_embeds.weight
            for x in embedding_weights:
                new_embedding.weight[n] = x
                n += 1
            self.transformer.set_input_embeddings(new_embedding)
        processed_tokens = []
        for x in out_tokens:
            processed_tokens += [
                list(map(lambda a: n if a == -1 else a, x))
            ]  # The EOS token should always be the largest one
        return processed_tokens
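
    # Illustrative example (hypothetical ids and shapes): for CLIP-L the
    # embedding width is 768 and the base vocabulary has 49408 entries, so a
    # textual-inversion vector mixed into a prompt receives id 49408:
    #   tokens = [[49406, 320, torch.zeros(768), 49407]]
    #   processed = model.set_up_textual_embeddings(tokens, embeds)
    #   # -> [[49406, 320, 49408, 49407]]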

    def forward(self, tokens: list) -> tuple:
        """#### Forward pass of the model.
        #### Args:
            - `tokens` (list): The input tokens.
        #### Returns:
            - `tuple`: The output and the pooled output.
        """
        backup_embeds = self.transformer.get_input_embeddings()
        device = backup_embeds.weight.device
        tokens = self.set_up_textual_embeddings(tokens, backup_embeds)
        tokens = torch.LongTensor(tokens).to(device)

        attention_mask = None
        if (
            self.enable_attention_masks
            or self.zero_out_masked
            or self.return_attention_masks
        ):
            # Mark tokens as attended (1) up to and including the first end
            # token; everything after it stays masked (0).
            attention_mask = torch.zeros_like(tokens)
            end_token = self.special_tokens.get("end", -1)
            for x in range(attention_mask.shape[0]):
                for y in range(attention_mask.shape[1]):
                    attention_mask[x, y] = 1
                    if tokens[x, y] == end_token:
                        break

        attention_mask_model = None
        if self.enable_attention_masks:
            attention_mask_model = attention_mask

        outputs = self.transformer(
            tokens,
            attention_mask_model,
            intermediate_output=self.layer_idx,
            final_layer_norm_intermediate=self.layer_norm_hidden_state,
            dtype=torch.float32,
        )
        self.transformer.set_input_embeddings(backup_embeds)

        if self.layer == "last":
            z = outputs[0].float()
        else:
            z = outputs[1].float()

        if self.zero_out_masked:
            z *= attention_mask.unsqueeze(-1).float()

        pooled_output = None
        if len(outputs) >= 3:
            if (
                not self.return_projected_pooled
                and len(outputs) >= 4
                and outputs[3] is not None
            ):
                pooled_output = outputs[3].float()
            elif outputs[2] is not None:
                pooled_output = outputs[2].float()

        extra = {}
        if self.return_attention_masks:
            extra["attention_mask"] = attention_mask

        if len(extra) > 0:
            return z, pooled_output, extra

        return z, pooled_output

    def encode(self, tokens: list) -> tuple:
        """#### Encode the input tokens.
        #### Args:
            - `tokens` (list): The input tokens.
        #### Returns:
            - `tuple`: The encoded tokens and the pooled output.
        """
        return self(tokens)

    def load_sd(self, sd: dict):
        """#### Load the state dictionary.
        #### Args:
            - `sd` (dict): The state dictionary.
        #### Returns:
            - The missing and unexpected keys reported by `load_state_dict`.
        """
        return self.transformer.load_state_dict(sd, strict=False)


class SD1ClipModel(torch.nn.Module):
    """#### Class representing the SD1ClipModel."""

    def __init__(
        self,
        device: str = "cpu",
        dtype: torch.dtype = None,
        clip_name: str = "l",
        clip_model: type = SDClipModel,
        **kwargs,
    ):
        """#### Initialize the SD1ClipModel.
        #### Args:
            - `device` (str, optional): The device to use. Defaults to "cpu".
            - `dtype` (torch.dtype, optional): The data type. Defaults to None.
            - `clip_name` (str, optional): The name of the CLIP model. Defaults to "l".
            - `clip_model` (type, optional): The CLIP model class. Defaults to SDClipModel.
            - `**kwargs`: Additional keyword arguments.
        """
        super().__init__()
        self.clip_name = clip_name
        self.clip = "clip_{}".format(self.clip_name)

        self.lowvram_patch_counter = 0
        self.model_loaded_weight_memory = 0

        setattr(self, self.clip, clip_model(device=device, dtype=dtype, **kwargs))

    def set_clip_options(self, options: dict) -> None:
        """#### Set the CLIP options.
        #### Args:
            - `options` (dict): The options to set.
        """
        getattr(self, self.clip).set_clip_options(options)

    def reset_clip_options(self) -> None:
        """#### Reset the CLIP options to default."""
        getattr(self, self.clip).reset_clip_options()

    def encode_token_weights(self, token_weight_pairs: dict) -> tuple:
        """#### Encode token weights.
        #### Args:
            - `token_weight_pairs` (dict): The token weight pairs, keyed by clip name.
        #### Returns:
            - `tuple`: The encoded tokens and the pooled output.
        """
        token_weight_pairs = token_weight_pairs[self.clip_name]
        out, pooled = getattr(self, self.clip).encode_token_weights(token_weight_pairs)
        return out, pooled
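

if __name__ == "__main__":
    # Minimal smoke-test sketch (illustrative): build the text encoder on CPU
    # and encode one hypothetical prompt. Only the start/end ids are CLIP's
    # real special tokens; `320` is a made-up token id standing in for real
    # tokenizer output, and the default config path must exist in this repo.
    model = SD1ClipModel(device="cpu")
    pairs = {"l": [[(49406, 1.0), (320, 1.0), (49407, 1.0)]]}
    cond, pooled = model.encode_token_weights(pairs)
    print(cond.shape, None if pooled is None else pooled.shape)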