harsh99 committed · Commit b993f12 · 0 Parent(s)

implementation of stable diffusion from scratch

Files changed (17)
  1. .gitignore +53 -0
  2. README.md +4 -0
  3. attention.py +117 -0
  4. clip.py +86 -0
  5. ddpm.py +119 -0
  6. decoder.py +134 -0
  7. diffusion.py +297 -0
  8. dog.jpg +0 -0
  9. encoder.py +91 -0
  10. interface.py +151 -0
  11. merges.txt +0 -0
  12. model.py +28 -0
  13. model_converter.py +0 -0
  14. pipeline.py +174 -0
  15. requirements.txt +115 -0
  16. test.ipynb +0 -0
  17. vocab.json +0 -0
.gitignore ADDED
@@ -0,0 +1,53 @@
*inkpunk-diffusion-v1.ckpt

# Byte-compiled / optimized / DLL files
__pycache__/
**/__pycache__/
*.py[cod]

# C extensions
*.so

# Distribution / packaging
bin/
build/
develop-eggs/
dist/
eggs/
lib/
lib64/
parts/
sdist/
var/
*.egg-info/
.installed.cfg
*.egg

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
.tox/
.coverage
.cache
nosetests.xml
coverage.xml

# Translations
*.mo

# Mr Developer
.mr.developer.cfg
.project
.pydevproject

# Rope
.ropeproject

# Django stuff:
*.log
*.pot

# Sphinx documentation
docs/_build/
README.md ADDED
@@ -0,0 +1,4 @@
# stable-diffusion

<!-- 1. Download `vocab.json` and `merges.txt` from https://huggingface.co/CompVis/stable-diffusion-v1-4/tree/main/tokenizer and save them in the `data` folder
2. Download `inkpunk-diffusion-v1.ckpt` from https://huggingface.co/Envvi/Inkpunk-Diffusion/tree/main and save it in the `data` folder -->
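For reference, a minimal text-to-image sketch of how the modules in this commit fit together (assumptions: `vocab.json`, `merges.txt`, and `inkpunk-diffusion-v1.ckpt` sit next to the scripts, as `interface.py` expects, and the prompt and output filename are only illustrative):

import torch
from PIL import Image
from transformers import CLIPTokenizer

import model
import pipeline

device = "cuda" if torch.cuda.is_available() else "cpu"

# Tokenizer files and checkpoint are assumed to be in the working directory
tokenizer = CLIPTokenizer("vocab.json", merges_file="merges.txt")
models = model.preload_models_from_standard_weights("inkpunk-diffusion-v1.ckpt", device)

output = pipeline.generate(
    prompt="a dog wearing sunglasses, highly detailed",  # illustrative prompt
    uncond_prompt="",
    do_cfg=True,
    cfg_scale=8,
    sampler_name="ddpm",
    n_inference_steps=50,
    seed=42,
    models=models,
    device=device,
    idle_device="cpu",
    tokenizer=tokenizer,
)
Image.fromarray(output).save("output.png")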
attention.py ADDED
@@ -0,0 +1,117 @@
import torch
from torch import nn
from torch.nn import functional as F
import math

class SelfAttention(nn.Module):
    def __init__(self, n_heads, d_embed, in_proj_bias=True, out_proj_bias=True):
        super().__init__()
        # This combines the Wq, Wk and Wv matrices into one matrix
        self.in_proj = nn.Linear(d_embed, 3 * d_embed, bias=in_proj_bias)

        # This one represents the Wo matrix
        self.out_proj = nn.Linear(d_embed, d_embed, bias=out_proj_bias)

        self.n_heads = n_heads
        self.d_head = d_embed // self.n_heads

    def forward(self, x, causal_mask=False):
        # x: (batch_size, seq_len, dim)
        input_shape = x.shape
        batch_size, sequence_length, d_embed = input_shape
        interim_shape = (batch_size, sequence_length, self.n_heads, self.d_head)

        # Apply the in_proj to get the queries, keys, and values all at once
        # (batch_size, seq_len, dim) -> (batch_size, seq_len, 3 * dim)
        q, k, v = self.in_proj(x).chunk(3, dim=-1)

        # Reshape to (batch_size, seq_len, n_heads, d_head)
        q = q.view(interim_shape)
        k = k.view(interim_shape)
        v = v.view(interim_shape)

        # Transpose for attention dot product: (batch_size, n_heads, seq_len, d_head)
        q = q.transpose(1, 2)
        k = k.transpose(1, 2)
        v = v.transpose(1, 2)

        # (batch_size, n_heads, seq_len, d_head) @ (batch_size, n_heads, d_head, seq_len) -> (batch_size, n_heads, seq_len, seq_len)
        attention_weights = q @ k.transpose(-1, -2)

        # Scaling by sqrt(d_head)
        attention_weights = attention_weights / math.sqrt(self.d_head)

        # Causal mask to prevent attending to future tokens
        if causal_mask:
            mask = torch.ones_like(attention_weights, dtype=torch.bool).triu(1)
            attention_weights.masked_fill_(mask, -torch.inf)

        # Apply softmax to get attention probabilities
        attention_weights = F.softmax(attention_weights, dim=-1)

        # Apply attention weights: (batch_size, n_heads, seq_len, seq_len) @ (batch_size, n_heads, seq_len, d_head) -> (batch_size, n_heads, seq_len, d_head)
        output = attention_weights @ v

        # Transpose back: (batch_size, seq_len, n_heads, d_head)
        output = output.transpose(1, 2)

        # Reshape back to (batch_size, seq_len, dim)
        output = output.reshape(input_shape)

        # Apply output projection
        output = self.out_proj(output)

        return output

class CrossAttention(nn.Module):
    def __init__(self, n_heads, d_embed, d_cross, in_proj_bias=True, out_proj_bias=True):
        super().__init__()
        self.q_proj = nn.Linear(d_embed, d_embed, bias=in_proj_bias)
        self.k_proj = nn.Linear(d_cross, d_embed, bias=in_proj_bias)
        self.v_proj = nn.Linear(d_cross, d_embed, bias=in_proj_bias)
        self.out_proj = nn.Linear(d_embed, d_embed, bias=out_proj_bias)
        self.n_heads = n_heads
        self.d_head = d_embed // n_heads

    def forward(self, x, y):
        # x (latent): (Batch_Size, Seq_Len_Q, Dim_Q)
        # y (context): (Batch_Size, Seq_Len_KV, Dim_KV) = (Batch_Size, 77, 768)
        input_shape = x.shape
        batch_size, sequence_length, d_embed = input_shape
        # Divide each embedding of Q into multiple heads such that d_head * n_heads = Dim_Q
        interim_shape = (batch_size, -1, self.n_heads, self.d_head)

        # (Batch_Size, Seq_Len_Q, Dim_Q) -> (Batch_Size, Seq_Len_Q, Dim_Q)
        q = self.q_proj(x)
        # (Batch_Size, Seq_Len_KV, Dim_KV) -> (Batch_Size, Seq_Len_KV, Dim_Q)
        k = self.k_proj(y)
        v = self.v_proj(y)

        # (Batch_Size, Seq_Len_Q, Dim_Q) -> (Batch_Size, Seq_Len_Q, H, Dim_Q / H) -> (Batch_Size, H, Seq_Len_Q, Dim_Q / H)
        q = q.view(interim_shape).transpose(1, 2)
        # (Batch_Size, Seq_Len_KV, Dim_Q) -> (Batch_Size, Seq_Len_KV, H, Dim_Q / H) -> (Batch_Size, H, Seq_Len_KV, Dim_Q / H)
        k = k.view(interim_shape).transpose(1, 2)
        v = v.view(interim_shape).transpose(1, 2)

        # (Batch_Size, H, Seq_Len_Q, Dim_Q / H) @ (Batch_Size, H, Dim_Q / H, Seq_Len_KV) -> (Batch_Size, H, Seq_Len_Q, Seq_Len_KV)
        weight = q @ k.transpose(-1, -2)

        # (Batch_Size, H, Seq_Len_Q, Seq_Len_KV)
        weight /= math.sqrt(self.d_head)
        weight = F.softmax(weight, dim=-1)

        # (Batch_Size, H, Seq_Len_Q, Seq_Len_KV) @ (Batch_Size, H, Seq_Len_KV, Dim_Q / H) -> (Batch_Size, H, Seq_Len_Q, Dim_Q / H)
        output = weight @ v

        # (Batch_Size, H, Seq_Len_Q, Dim_Q / H) -> (Batch_Size, Seq_Len_Q, H, Dim_Q / H)
        output = output.transpose(1, 2).contiguous()

        # (Batch_Size, Seq_Len_Q, H, Dim_Q / H) -> (Batch_Size, Seq_Len_Q, Dim_Q)
        output = output.view(input_shape)

        # (Batch_Size, Seq_Len_Q, Dim_Q) -> (Batch_Size, Seq_Len_Q, Dim_Q)
        output = self.out_proj(output)

        # (Batch_Size, Seq_Len_Q, Dim_Q)
        return output
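attention.py ships without the quick self-test the other modules have; a minimal shape check could look like this (a sketch run from the repo root; the sizes mirror how the blocks are used elsewhere in the repo: 768-dim CLIP embeddings for self-attention, and a 77x768 text context for cross-attention):

import torch
from attention import SelfAttention, CrossAttention

# Self-attention over a CLIP-sized sequence: (1, 77, 768) -> (1, 77, 768)
self_attn = SelfAttention(n_heads=12, d_embed=768)
x = torch.randn(1, 77, 768)
with torch.no_grad():
    print("SelfAttention output:", self_attn(x, causal_mask=True).shape)

# Cross-attention between a latent sequence and the text context:
# queries (1, 64, 320) attend to context (1, 77, 768) -> (1, 64, 320)
cross_attn = CrossAttention(n_heads=8, d_embed=320, d_cross=768)
latent = torch.randn(1, 64, 320)
context = torch.randn(1, 77, 768)
with torch.no_grad():
    print("CrossAttention output:", cross_attn(latent, context).shape)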
clip.py ADDED
@@ -0,0 +1,86 @@
import torch
from torch import nn
from torch.nn import functional as F

from attention import SelfAttention

class CLIPEmbedding(nn.Module):
    def __init__(self, n_vocab, n_embed, n_token):
        super().__init__()
        self.token_embedding = nn.Embedding(n_vocab, n_embed)
        self.position_embedding = nn.Parameter(torch.zeros((n_token, n_embed)))

    def forward(self, tokens: torch.Tensor):
        x = self.token_embedding(tokens)
        x += self.position_embedding

        return x

class CLIPLayer(nn.Module):
    def __init__(self, n_head, n_embed):
        super().__init__()
        self.layernorm_1 = nn.LayerNorm(n_embed)
        self.attention = SelfAttention(n_head, n_embed)
        self.layernorm_2 = nn.LayerNorm(n_embed)

        self.linear_1 = nn.Linear(n_embed, 4 * n_embed)
        self.linear_2 = nn.Linear(4 * n_embed, n_embed)

    def forward(self, x):
        residue = x

        x = self.layernorm_1(x)

        x = self.attention(x, causal_mask=True)

        x += residue

        residue = x

        x = self.layernorm_2(x)

        x = self.linear_1(x)

        # QuickGELU activation: x * sigmoid(1.702 * x)
        x = x * torch.sigmoid(1.702 * x)

        x = self.linear_2(x)
        x += residue

        return x

class CLIP(nn.Module):
    def __init__(self):
        super().__init__()
        self.embedding = CLIPEmbedding(49408, 768, 77)
        self.layers = nn.ModuleList([
            CLIPLayer(12, 768) for i in range(12)
        ])

        self.layernorm = nn.LayerNorm(768)

    def forward(self, tokens: torch.LongTensor) -> torch.FloatTensor:
        tokens = tokens.type(torch.long)

        state = self.embedding(tokens)

        for layer in self.layers:
            state = layer(state)

        output = self.layernorm(state)

        return output

if __name__ == "__main__":
    dummy_tokens = torch.randint(0, 49408, (1, 77))  # (Batch_Size, Seq_Len)

    # Instantiate the model
    model = CLIP()

    # Forward pass
    with torch.no_grad():  # no need to track gradients for testing
        output = model(dummy_tokens)

    # Print the output shape
    # Output shape: torch.Size([1, 77, 768])
    print("Output shape:", output.shape)
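The `x * torch.sigmoid(1.702 * x)` line in CLIPLayer is the QuickGELU activation used by the original CLIP; a tiny standalone comparison against PyTorch's exact GELU (illustrative only):

import torch
from torch.nn import functional as F

x = torch.linspace(-3.0, 3.0, 7)
quick_gelu = x * torch.sigmoid(1.702 * x)   # the approximation used in CLIPLayer above
exact_gelu = F.gelu(x)
print("max abs difference:", (quick_gelu - exact_gelu).abs().max().item())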
ddpm.py ADDED
@@ -0,0 +1,119 @@
import torch
import numpy as np

class DDPMSampler:

    def __init__(self, generator: torch.Generator, num_training_steps=1000, beta_start: float = 0.00085, beta_end: float = 0.0120):
        # Params "beta_start" and "beta_end" taken from: https://github.com/CompVis/stable-diffusion/blob/21f890f9da3cfbeaba8e2ac3c425ee9e998d5229/configs/stable-diffusion/v1-inference.yaml#L5C8-L5C8
        # For the naming conventions, refer to the DDPM paper (https://arxiv.org/pdf/2006.11239.pdf)
        self.betas = torch.linspace(beta_start ** 0.5, beta_end ** 0.5, num_training_steps, dtype=torch.float32) ** 2
        self.alphas = 1.0 - self.betas
        self.alphas_cumprod = torch.cumprod(self.alphas, dim=0)
        self.one = torch.tensor(1.0)

        self.generator = generator

        self.num_train_timesteps = num_training_steps
        self.timesteps = torch.from_numpy(np.arange(0, num_training_steps)[::-1].copy())

    def set_inference_timesteps(self, num_inference_steps=50):
        self.num_inference_steps = num_inference_steps
        step_ratio = self.num_train_timesteps // self.num_inference_steps
        timesteps = (np.arange(0, num_inference_steps) * step_ratio).round()[::-1].copy().astype(np.int64)
        self.timesteps = torch.from_numpy(timesteps)

    def _get_previous_timestep(self, timestep: int) -> int:
        prev_t = timestep - self.num_train_timesteps // self.num_inference_steps
        return prev_t

    def _get_variance(self, timestep: int) -> torch.Tensor:
        prev_t = self._get_previous_timestep(timestep)

        alpha_prod_t = self.alphas_cumprod[timestep]
        alpha_prod_t_prev = self.alphas_cumprod[prev_t] if prev_t >= 0 else self.one
        current_beta_t = 1 - alpha_prod_t / alpha_prod_t_prev

        # For t > 0, compute the predicted variance βt (see formulas (6) and (7) from https://arxiv.org/pdf/2006.11239.pdf)
        # and sample from it to get the previous sample
        # x_{t-1} ~ N(pred_prev_sample, variance) == add variance to pred_sample
        variance = (1 - alpha_prod_t_prev) / (1 - alpha_prod_t) * current_beta_t

        # we always take the log of variance, so clamp it to ensure it's not 0
        variance = torch.clamp(variance, min=1e-20)

        return variance

    def set_strength(self, strength=1):
        """
        Set how much noise to add to the input image.
        More noise (strength ~ 1) means that the output will be further from the input image.
        Less noise (strength ~ 0) means that the output will be closer to the input image.
        """
        # start_step is the number of noise levels to skip
        start_step = self.num_inference_steps - int(self.num_inference_steps * strength)
        self.timesteps = self.timesteps[start_step:]
        self.start_step = start_step

    def step(self, timestep: int, latents: torch.Tensor, model_output: torch.Tensor):
        t = timestep
        prev_t = self._get_previous_timestep(t)

        # 1. Compute alphas, betas
        alpha_prod_t = self.alphas_cumprod[t]
        alpha_prod_t_prev = self.alphas_cumprod[prev_t] if prev_t >= 0 else self.one
        beta_prod_t = 1 - alpha_prod_t
        beta_prod_t_prev = 1 - alpha_prod_t_prev
        current_alpha_t = alpha_prod_t / alpha_prod_t_prev
        current_beta_t = 1 - current_alpha_t

        # 2. Compute the predicted original sample from the predicted noise, also called
        # "predicted x_0" in formula (15) of https://arxiv.org/pdf/2006.11239.pdf
        pred_original_sample = (latents - beta_prod_t ** (0.5) * model_output) / alpha_prod_t ** (0.5)

        # 3. Compute the coefficients for pred_original_sample x_0 and the current sample x_t
        # See formula (7) from https://arxiv.org/pdf/2006.11239.pdf
        pred_original_sample_coeff = (alpha_prod_t_prev ** (0.5) * current_beta_t) / beta_prod_t
        current_sample_coeff = current_alpha_t ** (0.5) * beta_prod_t_prev / beta_prod_t

        # 4. Compute the predicted previous sample mean µ_t
        # See formula (7) from https://arxiv.org/pdf/2006.11239.pdf
        pred_prev_sample = pred_original_sample_coeff * pred_original_sample + current_sample_coeff * latents

        # 5. Add noise
        variance = 0
        if t > 0:
            device = model_output.device
            noise = torch.randn(model_output.shape, generator=self.generator, device=device, dtype=model_output.dtype)
            # Compute the variance as per formula (7) from https://arxiv.org/pdf/2006.11239.pdf
            variance = (self._get_variance(t) ** 0.5) * noise

        # A sample from N(mu, sigma^2) can be obtained as X = mu + sigma * N(0, 1);
        # the variable "variance" is already multiplied by the noise N(0, 1)
        pred_prev_sample = pred_prev_sample + variance

        return pred_prev_sample

    def add_noise(
        self,
        original_samples: torch.FloatTensor,
        timesteps: torch.IntTensor,
    ) -> torch.FloatTensor:
        alphas_cumprod = self.alphas_cumprod.to(device=original_samples.device, dtype=original_samples.dtype)
        timesteps = timesteps.to(original_samples.device)

        sqrt_alpha_prod = alphas_cumprod[timesteps] ** 0.5
        sqrt_alpha_prod = sqrt_alpha_prod.flatten()
        while len(sqrt_alpha_prod.shape) < len(original_samples.shape):
            sqrt_alpha_prod = sqrt_alpha_prod.unsqueeze(-1)

        sqrt_one_minus_alpha_prod = (1 - alphas_cumprod[timesteps]) ** 0.5
        sqrt_one_minus_alpha_prod = sqrt_one_minus_alpha_prod.flatten()
        while len(sqrt_one_minus_alpha_prod.shape) < len(original_samples.shape):
            sqrt_one_minus_alpha_prod = sqrt_one_minus_alpha_prod.unsqueeze(-1)

        # Sample from q(x_t | x_0) as in equation (4) of https://arxiv.org/pdf/2006.11239.pdf
        # Because a sample from N(mu, sigma^2) can be obtained as X = mu + sigma * N(0, 1),
        # here mu = sqrt_alpha_prod * original_samples and sigma = sqrt_one_minus_alpha_prod
        noise = torch.randn(original_samples.shape, generator=self.generator, device=original_samples.device, dtype=original_samples.dtype)
        noisy_samples = sqrt_alpha_prod * original_samples + sqrt_one_minus_alpha_prod * noise
        return noisy_samples
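A short sketch of how pipeline.py drives this sampler: choose the inference timesteps, noise a clean latent with add_noise, then take one reverse step (the latent and the "predicted noise" below are random placeholders for the UNet output):

import torch
from ddpm import DDPMSampler

generator = torch.Generator().manual_seed(42)
sampler = DDPMSampler(generator)
sampler.set_inference_timesteps(50)              # 50 steps spread over the 1000 training steps

clean_latents = torch.randn(1, 4, 64, 64, generator=generator)
t = sampler.timesteps[0]                         # the noisiest timestep of the schedule

# Forward process: sample from q(x_t | x_0)
noisy_latents = sampler.add_noise(clean_latents, t)

# One reverse step, with random values standing in for the UNet's predicted noise
predicted_noise = torch.randn_like(noisy_latents)
prev_latents = sampler.step(int(t), noisy_latents, predicted_noise)
print(noisy_latents.shape, prev_latents.shape)   # both (1, 4, 64, 64)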
decoder.py ADDED
@@ -0,0 +1,134 @@
import torch
from torch import nn
from attention import SelfAttention
from torch.nn import functional as F


class VAE_ResidualBlock(nn.Module):
    def __init__(self, in_channels, out_channels):
        super().__init__()
        self.grpnorm_1 = nn.GroupNorm(32, in_channels)
        self.conv_1 = nn.Conv2d(in_channels, out_channels, kernel_size=3, padding=1)

        self.grpnorm_2 = nn.GroupNorm(32, out_channels)
        self.conv_2 = nn.Conv2d(out_channels, out_channels, kernel_size=3, padding=1)

        if in_channels == out_channels:
            self.residual_layer = nn.Identity()
        else:
            self.residual_layer = nn.Conv2d(in_channels, out_channels, kernel_size=1, padding=0)

    def forward(self, x):
        residue = x

        x = self.grpnorm_1(x)
        x = F.silu(x)

        x = self.conv_1(x)

        x = self.grpnorm_2(x)
        x = F.silu(x)

        x = self.conv_2(x)

        return x + self.residual_layer(residue)

class VAE_AttentionBlock(nn.Module):
    def __init__(self, channels):
        super().__init__()
        self.grpnorm = nn.GroupNorm(32, channels)
        self.attention = SelfAttention(1, channels)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        # x: (Batch_Size, Features, Height, Width)
        residue = x

        # (Batch_Size, Features, Height, Width) -> (Batch_Size, Features, Height, Width)
        x = self.grpnorm(x)
        n, c, h, w = x.shape

        # (Batch_Size, Features, Height, Width) -> (Batch_Size, Features, Height * Width)
        x = x.view((n, c, h * w))

        # (Batch_Size, Features, Height * Width) -> (Batch_Size, Height * Width, Features)
        x = x.transpose(-1, -2)

        # (Batch_Size, Height * Width, Features) -> (Batch_Size, Height * Width, Features)
        x = self.attention(x)

        # (Batch_Size, Height * Width, Features) -> (Batch_Size, Features, Height * Width)
        x = x.transpose(-1, -2)

        # (Batch_Size, Features, Height, Width)
        x = x.view((n, c, h, w))

        x += residue

        return x

class VAE_Decoder(nn.Sequential):
    def __init__(self):
        super().__init__(
            nn.Conv2d(4, 4, kernel_size=1, padding=0),

            nn.Conv2d(4, 512, kernel_size=3, padding=1),

            VAE_ResidualBlock(512, 512),

            VAE_AttentionBlock(512),

            VAE_ResidualBlock(512, 512),
            VAE_ResidualBlock(512, 512),
            VAE_ResidualBlock(512, 512),
            VAE_ResidualBlock(512, 512),

            nn.Upsample(scale_factor=2),

            nn.Conv2d(512, 512, kernel_size=3, padding=1),

            VAE_ResidualBlock(512, 512),
            VAE_ResidualBlock(512, 512),
            VAE_ResidualBlock(512, 512),

            nn.Upsample(scale_factor=2),

            nn.Conv2d(512, 512, kernel_size=3, padding=1),

            VAE_ResidualBlock(512, 256),
            VAE_ResidualBlock(256, 256),
            VAE_ResidualBlock(256, 256),

            nn.Upsample(scale_factor=2),

            nn.Conv2d(256, 256, kernel_size=3, padding=1),

            VAE_ResidualBlock(256, 128),
            VAE_ResidualBlock(128, 128),
            VAE_ResidualBlock(128, 128),

            nn.GroupNorm(32, 128),

            nn.SiLU(),

            nn.Conv2d(128, 3, kernel_size=3, padding=1),
        )

    def forward(self, x):
        # Undo the latent scaling applied by the encoder
        x /= 0.18215

        for module in self:
            x = module(x)

        return x

if __name__ == "__main__":
    model = VAE_Decoder()
    model.eval()

    # Create a dummy latent tensor: (batch_size=1, channels=4, height=8, width=8)
    x = torch.randn(1, 4, 8, 8)

    with torch.no_grad():
        output = model(x)

    print("Input shape :", x.shape)
    print("Output shape:", output.shape)
diffusion.py ADDED
@@ -0,0 +1,297 @@
import torch
from torch import nn
from torch.nn import functional as F
from attention import SelfAttention, CrossAttention

class TimeEmbedding(nn.Module):
    def __init__(self, n_embed):
        super().__init__()
        self.linear_1 = nn.Linear(n_embed, 4 * n_embed)
        self.linear_2 = nn.Linear(4 * n_embed, 4 * n_embed)

    def forward(self, x):
        x = self.linear_1(x)
        x = F.silu(x)
        x = self.linear_2(x)
        return x

class UNET_ResidualBlock(nn.Module):
    def __init__(self, in_channels, out_channels, n_time=1280):
        super().__init__()
        self.grpnorm_feature = nn.GroupNorm(32, in_channels)
        self.conv_feature = nn.Conv2d(in_channels, out_channels, kernel_size=3, padding=1)
        self.linear_time = nn.Linear(n_time, out_channels)

        self.grpnorm_merged = nn.GroupNorm(32, out_channels)
        self.conv_merged = nn.Conv2d(out_channels, out_channels, kernel_size=3, padding=1)

        if in_channels == out_channels:
            self.residual_layer = nn.Identity()
        else:
            self.residual_layer = nn.Conv2d(in_channels, out_channels, kernel_size=1, padding=0)

    def forward(self, feature, time):
        residue = feature

        feature = self.grpnorm_feature(feature)
        feature = F.silu(feature)

        feature = self.conv_feature(feature)

        time = F.silu(time)
        time = self.linear_time(time)

        merged = feature + time.unsqueeze(-1).unsqueeze(-1)

        merged = self.grpnorm_merged(merged)
        merged = F.silu(merged)
        merged = self.conv_merged(merged)

        return merged + self.residual_layer(residue)

class UNET_AttentionBlock(nn.Module):
    def __init__(self, n_head, n_embed, d_context=768):
        super().__init__()

        channels = n_head * n_embed

        self.grpnorm = nn.GroupNorm(32, channels, eps=1e-6)
        self.conv_input = nn.Conv2d(channels, channels, kernel_size=1, padding=0)

        self.layernorm_1 = nn.LayerNorm(channels)
        self.attention_1 = SelfAttention(n_head, channels, in_proj_bias=False)

        self.layernorm_2 = nn.LayerNorm(channels)
        self.attention_2 = CrossAttention(n_head, channels, d_context, in_proj_bias=False)

        self.layernorm_3 = nn.LayerNorm(channels)

        self.linear_geglu_1 = nn.Linear(channels, 4 * channels * 2)
        self.linear_geglu_2 = nn.Linear(4 * channels, channels)

        self.conv_output = nn.Conv2d(channels, channels, kernel_size=1, padding=0)

    def forward(self, x, context):
        residue_long = x

        x = self.grpnorm(x)
        x = self.conv_input(x)

        n, c, h, w = x.shape

        x = x.view((n, c, h * w))

        x = x.transpose(-1, -2)
        residue_short = x

        x = self.layernorm_1(x)
        x = self.attention_1(x)

        x += residue_short

        residue_short = x

        x = self.layernorm_2(x)
        x = self.attention_2(x, context)

        x += residue_short

        residue_short = x

        x = self.layernorm_3(x)
        # GeGLU: project to twice the hidden size, then gate one half with GELU of the other
        x, gate = self.linear_geglu_1(x).chunk(2, dim=-1)

        x = x * F.gelu(gate)

        x = self.linear_geglu_2(x)

        x += residue_short
        x = x.transpose(-1, -2)

        x = x.view((n, c, h, w))

        return self.conv_output(x) + residue_long

class Upsample(nn.Module):
    def __init__(self, channels):
        super().__init__()
        self.conv = nn.Conv2d(channels, channels, kernel_size=3, padding=1)

    def forward(self, x):
        x = F.interpolate(x, scale_factor=2, mode='nearest')
        return self.conv(x)

# SwitchSequential does not override __init__, so the layers passed to it are handled by
# nn.Sequential's constructor; only forward() is customized so that the extra context/time
# arguments are routed to the layer types that need them.
class SwitchSequential(nn.Sequential):
    def forward(self, x, context, time):
        for layer in self:
            if isinstance(layer, UNET_AttentionBlock):
                x = layer(x, context)
            elif isinstance(layer, UNET_ResidualBlock):
                x = layer(x, time)
            else:
                x = layer(x)
        return x

class UNET(nn.Module):
    def __init__(self):
        super().__init__()
        self.encoders = nn.ModuleList([
            # (Batch_Size, 4, Height / 8, Width / 8) -> (Batch_Size, 320, Height / 8, Width / 8)
            SwitchSequential(nn.Conv2d(4, 320, kernel_size=3, padding=1)),

            # (Batch_Size, 320, Height / 8, Width / 8) -> (Batch_Size, 320, Height / 8, Width / 8)
            SwitchSequential(UNET_ResidualBlock(320, 320), UNET_AttentionBlock(8, 40)),
            SwitchSequential(UNET_ResidualBlock(320, 320), UNET_AttentionBlock(8, 40)),

            # (Batch_Size, 320, Height / 8, Width / 8) -> (Batch_Size, 320, Height / 16, Width / 16)
            SwitchSequential(nn.Conv2d(320, 320, kernel_size=3, stride=2, padding=1)),

            # (Batch_Size, 320, Height / 16, Width / 16) -> (Batch_Size, 640, Height / 16, Width / 16)
            SwitchSequential(UNET_ResidualBlock(320, 640), UNET_AttentionBlock(8, 80)),

            # (Batch_Size, 640, Height / 16, Width / 16) -> (Batch_Size, 640, Height / 16, Width / 16)
            SwitchSequential(UNET_ResidualBlock(640, 640), UNET_AttentionBlock(8, 80)),

            # (Batch_Size, 640, Height / 16, Width / 16) -> (Batch_Size, 640, Height / 32, Width / 32)
            SwitchSequential(nn.Conv2d(640, 640, kernel_size=3, stride=2, padding=1)),

            # (Batch_Size, 640, Height / 32, Width / 32) -> (Batch_Size, 1280, Height / 32, Width / 32)
            SwitchSequential(UNET_ResidualBlock(640, 1280), UNET_AttentionBlock(8, 160)),

            # (Batch_Size, 1280, Height / 32, Width / 32) -> (Batch_Size, 1280, Height / 32, Width / 32)
            SwitchSequential(UNET_ResidualBlock(1280, 1280), UNET_AttentionBlock(8, 160)),

            # (Batch_Size, 1280, Height / 32, Width / 32) -> (Batch_Size, 1280, Height / 64, Width / 64)
            SwitchSequential(nn.Conv2d(1280, 1280, kernel_size=3, stride=2, padding=1)),

            # (Batch_Size, 1280, Height / 64, Width / 64) -> (Batch_Size, 1280, Height / 64, Width / 64)
            SwitchSequential(UNET_ResidualBlock(1280, 1280)),
            SwitchSequential(UNET_ResidualBlock(1280, 1280)),
        ])

        self.bottleneck = SwitchSequential(
            # (Batch_Size, 1280, Height / 64, Width / 64) -> (Batch_Size, 1280, Height / 64, Width / 64)
            UNET_ResidualBlock(1280, 1280),
            UNET_AttentionBlock(8, 160),
            UNET_ResidualBlock(1280, 1280),
        )

        self.decoders = nn.ModuleList([
            # (Batch_Size, 2560, Height / 64, Width / 64) -> (Batch_Size, 1280, Height / 64, Width / 64)
            SwitchSequential(UNET_ResidualBlock(2560, 1280)),
            SwitchSequential(UNET_ResidualBlock(2560, 1280)),

            # (Batch_Size, 2560, Height / 64, Width / 64) -> (Batch_Size, 1280, Height / 64, Width / 64) -> (Batch_Size, 1280, Height / 32, Width / 32)
            SwitchSequential(UNET_ResidualBlock(2560, 1280), Upsample(1280)),

            # (Batch_Size, 2560, Height / 32, Width / 32) -> (Batch_Size, 1280, Height / 32, Width / 32)
            SwitchSequential(UNET_ResidualBlock(2560, 1280), UNET_AttentionBlock(8, 160)),
            SwitchSequential(UNET_ResidualBlock(2560, 1280), UNET_AttentionBlock(8, 160)),

            # (Batch_Size, 1920, Height / 32, Width / 32) -> (Batch_Size, 1280, Height / 32, Width / 32) -> (Batch_Size, 1280, Height / 16, Width / 16)
            SwitchSequential(UNET_ResidualBlock(1920, 1280), UNET_AttentionBlock(8, 160), Upsample(1280)),

            # (Batch_Size, 1920, Height / 16, Width / 16) -> (Batch_Size, 640, Height / 16, Width / 16)
            SwitchSequential(UNET_ResidualBlock(1920, 640), UNET_AttentionBlock(8, 80)),

            # (Batch_Size, 1280, Height / 16, Width / 16) -> (Batch_Size, 640, Height / 16, Width / 16)
            SwitchSequential(UNET_ResidualBlock(1280, 640), UNET_AttentionBlock(8, 80)),

            # (Batch_Size, 960, Height / 16, Width / 16) -> (Batch_Size, 640, Height / 16, Width / 16) -> (Batch_Size, 640, Height / 8, Width / 8)
            SwitchSequential(UNET_ResidualBlock(960, 640), UNET_AttentionBlock(8, 80), Upsample(640)),

            # (Batch_Size, 960, Height / 8, Width / 8) -> (Batch_Size, 320, Height / 8, Width / 8)
            SwitchSequential(UNET_ResidualBlock(960, 320), UNET_AttentionBlock(8, 40)),

            # (Batch_Size, 640, Height / 8, Width / 8) -> (Batch_Size, 320, Height / 8, Width / 8)
            SwitchSequential(UNET_ResidualBlock(640, 320), UNET_AttentionBlock(8, 40)),
            SwitchSequential(UNET_ResidualBlock(640, 320), UNET_AttentionBlock(8, 40)),
        ])

    def forward(self, x, context, time):
        # x: (Batch_Size, 4, Height / 8, Width / 8)
        # context: (Batch_Size, Seq_Len, Dim)
        # time: (1, 1280)

        skip_connections = []
        for layers in self.encoders:
            x = layers(x, context, time)
            skip_connections.append(x)

        x = self.bottleneck(x, context, time)

        for layers in self.decoders:
            # Since we always concatenate with the skip connection of the encoder, the number of features increases before being sent to the decoder's layer
            x = torch.cat((x, skip_connections.pop()), dim=1)
            x = layers(x, context, time)

        return x


class UNET_OutputLayer(nn.Module):
    def __init__(self, in_channels, out_channels):
        super().__init__()
        self.grpnorm = nn.GroupNorm(32, in_channels)

        self.conv = nn.Conv2d(in_channels, out_channels, kernel_size=3, padding=1)

    def forward(self, x):
        x = self.grpnorm(x)
        x = F.silu(x)

        x = self.conv(x)
        return x

class Diffusion(nn.Module):
    def __init__(self):
        super().__init__()
        self.time_embedding = TimeEmbedding(320)
        self.unet = UNET()
        self.final = UNET_OutputLayer(320, 4)

    def forward(self, latent, context, time):
        time = self.time_embedding(time)

        output = self.unet(latent, context, time)

        output = self.final(output)

        return output

if __name__ == "__main__":
    # Dummy inputs
    batch_size = 10
    height = 64
    width = 64
    in_channels = 4
    context_dim = 768
    seq_len = 77

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Create model and move to device
    model = Diffusion().to(device)

    # Random input tensor with 4 channels
    x = torch.randn(batch_size, in_channels, height, width).to(device)

    print('Input shape to UNET: ', x.shape)

    # Time embedding (e.g., timestep from a diffusion schedule)
    t = torch.randn(batch_size, 320).to(device)

    print('Time Embedding shape to UNET: ', t.shape)

    # Context for cross attention (e.g., text embedding from CLIP or transformer)
    context = torch.randn(batch_size, seq_len, context_dim).to(device)

    print('context shape to UNET: ', context.shape)

    # Forward pass
    with torch.no_grad():
        output = model(x, context, t)
    print(output)

    print("Output shape of UNET:", output.shape)
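The linear_geglu_1 / chunk / F.gelu(gate) sequence in UNET_AttentionBlock is the GeGLU feed-forward variant; isolated with toy sizes it looks like this (a standalone sketch, not tied to the checkpoint):

import torch
from torch import nn
from torch.nn import functional as F

channels = 320
proj_in = nn.Linear(channels, 4 * channels * 2)   # value and gate halves in one projection
proj_out = nn.Linear(4 * channels, channels)

x = torch.randn(1, 64, channels)                  # (Batch, Seq_Len, Channels)
value, gate = proj_in(x).chunk(2, dim=-1)         # each half is (1, 64, 4 * channels)
hidden = value * F.gelu(gate)                     # GeGLU gating
out = proj_out(hidden)
print(out.shape)                                  # (1, 64, 320)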
dog.jpg ADDED
encoder.py ADDED
@@ -0,0 +1,91 @@
import torch
import torch.nn as nn
import torch.nn.functional as F
from decoder import VAE_AttentionBlock, VAE_ResidualBlock

class VAE_Encoder(nn.Sequential):
    def __init__(self):
        super().__init__(
            # (Batch_Size, Channel, Height, Width) -> (Batch_Size, 128, Height, Width)
            nn.Conv2d(3, 128, kernel_size=3, padding=1),

            # (Batch_Size, 128, Height, Width) -> (Batch_Size, 128, Height, Width)
            VAE_ResidualBlock(128, 128),
            VAE_ResidualBlock(128, 128),

            # (Batch_Size, 128, Height, Width) -> (Batch_Size, 128, Height/2, Width/2)
            nn.Conv2d(128, 128, kernel_size=3, stride=2, padding=0),

            # (Batch_Size, 128, Height/2, Width/2) -> (Batch_Size, 256, Height/2, Width/2)
            VAE_ResidualBlock(128, 256),
            # (Batch_Size, 256, Height/2, Width/2) -> (Batch_Size, 256, Height/2, Width/2)
            VAE_ResidualBlock(256, 256),

            # (Batch_Size, 256, Height/2, Width/2) -> (Batch_Size, 256, Height/4, Width/4)
            nn.Conv2d(256, 256, kernel_size=3, stride=2, padding=0),

            # (Batch_Size, 256, Height/4, Width/4) -> (Batch_Size, 512, Height/4, Width/4)
            VAE_ResidualBlock(256, 512),
            # (Batch_Size, 512, Height/4, Width/4) -> (Batch_Size, 512, Height/4, Width/4)
            VAE_ResidualBlock(512, 512),

            # (Batch_Size, 512, Height/4, Width/4) -> (Batch_Size, 512, Height/8, Width/8)
            nn.Conv2d(512, 512, kernel_size=3, stride=2, padding=0),

            # (Batch_Size, 512, Height/8, Width/8) -> (Batch_Size, 512, Height/8, Width/8)
            VAE_ResidualBlock(512, 512),
            VAE_ResidualBlock(512, 512),
            VAE_ResidualBlock(512, 512),

            # (Batch_Size, 512, Height/8, Width/8) -> (Batch_Size, 512, Height/8, Width/8)
            VAE_AttentionBlock(512),

            VAE_ResidualBlock(512, 512),

            nn.GroupNorm(32, 512),

            nn.SiLU(),

            # (Batch_Size, 512, Height/8, Width/8) -> (Batch_Size, 8, Height/8, Width/8)
            nn.Conv2d(512, 8, kernel_size=3, padding=1),

            # (Batch_Size, 8, Height/8, Width/8) -> (Batch_Size, 8, Height/8, Width/8)
            nn.Conv2d(8, 8, kernel_size=1, padding=0)
        )

    def forward(self, x: torch.Tensor, noise: torch.Tensor) -> torch.Tensor:
        for module in self:
            # The downsampling convolutions use stride 2 with no padding, so pad asymmetrically
            # (right and bottom only) to keep the output at exactly Height/2 x Width/2
            if getattr(module, 'stride', None) == (2, 2):
                x = F.pad(x, (0, 1, 0, 1))

            x = module(x)

        # (Batch_Size, 8, Height/8, Width/8) -> two tensors of shape (Batch_Size, 4, Height/8, Width/8)
        mean, log_var = torch.chunk(x, 2, dim=1)

        log_var = torch.clamp(log_var, -30, 20)
        var = log_var.exp()
        stdev = var.sqrt()

        # Reparameterization trick: Z = mean + stdev * noise
        Z = mean + stdev * noise

        # Scale the latent by the constant used in Stable Diffusion
        Z *= 0.18215

        # print('-'*100)
        # print('Z shape: ', Z.shape)
        # print('-'*100)

        return Z

if __name__ == "__main__":
    model = VAE_Encoder()
    model.eval()

    # Create a dummy input tensor: (batch_size=1, channels=3, height=64, width=64)
    x = torch.randn(1, 3, 64, 64)
    noise = torch.randn(1, 4, 8, 8)  # Match the latent shape (Z)

    with torch.no_grad():
        output = model(x, noise)

    print("Input shape :", x.shape)
    print("Output shape:", output.shape)
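Because the encoder halves the spatial resolution three times and the decoder upsamples three times, a 512x512 image maps to a (4, 64, 64) latent and back; a roundtrip shape check with random weights (a sketch, somewhat heavy on CPU):

import torch
from encoder import VAE_Encoder
from decoder import VAE_Decoder

encoder = VAE_Encoder().eval()
decoder = VAE_Decoder().eval()

image = torch.randn(1, 3, 512, 512)            # (Batch, Channel, Height, Width)
noise = torch.randn(1, 4, 512 // 8, 512 // 8)  # matches the latent resolution

with torch.no_grad():
    latents = encoder(image, noise)            # (1, 4, 64, 64)
    reconstruction = decoder(latents)          # (1, 3, 512, 512)

print("Latents:", latents.shape)
print("Reconstruction:", reconstruction.shape)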
interface.py ADDED
@@ -0,0 +1,151 @@
import gradio as gr
import torch
from PIL import Image
from transformers import CLIPTokenizer

# Import your existing model and pipeline modules
import model
import pipeline

# Device Configuration
ALLOW_CUDA = True
ALLOW_MPS = False

def determine_device():
    if torch.cuda.is_available() and ALLOW_CUDA:
        return "cuda"
    elif (torch.backends.mps.is_built() or torch.backends.mps.is_available()) and ALLOW_MPS:
        return "mps"
    return "cpu"

DEVICE = determine_device()
print(f"Using device: {DEVICE}")

# Load tokenizer and models
tokenizer = CLIPTokenizer("vocab.json", merges_file="merges.txt")
model_file = "inkpunk-diffusion-v1.ckpt"
models = model.preload_models_from_standard_weights(model_file, DEVICE)
# models=None

def generate_image(
    prompt,
    uncond_prompt="",
    do_cfg=True,
    cfg_scale=8,
    sampler="ddpm",
    num_inference_steps=50,
    seed=42,
    input_image=None,
    strength=1.0
):
    """
    Generate an image using the Stable Diffusion pipeline

    Args:
    - prompt (str): Text description of the image to generate
    - uncond_prompt (str, optional): Negative prompt to guide generation
    - do_cfg (bool): Whether to use classifier-free guidance
    - cfg_scale (float): Classifier-free guidance scale
    - sampler (str): Sampling method
    - num_inference_steps (int): Number of denoising steps
    - seed (int): Random seed for reproducibility
    - input_image (PIL.Image, optional): Input image for image-to-image generation
    - strength (float): Strength of image transformation (0-1)

    Returns:
    - PIL.Image: Generated image
    """
    try:
        # Ensure input_image is None if not provided
        if input_image is None:
            strength = 1.0

        # Generate the image
        output_image = pipeline.generate(
            prompt=prompt,
            uncond_prompt=uncond_prompt,
            input_image=input_image,
            strength=strength,
            do_cfg=do_cfg,
            cfg_scale=cfg_scale,
            sampler_name=sampler,
            n_inference_steps=num_inference_steps,
            seed=seed,
            models=models,
            device=DEVICE,
            idle_device="cuda",
            tokenizer=tokenizer,
        )

        # Convert numpy array to PIL Image
        return Image.fromarray(output_image)

    except Exception as e:
        print(f"Error generating image: {e}")
        return None

def launch_gradio_interface():
    """
    Create and launch Gradio interface for Stable Diffusion
    """
    with gr.Blocks(title="Stable Diffusion Image Generator") as demo:
        gr.Markdown("# 🎨 Stable Diffusion Image Generator")

        with gr.Row():
            with gr.Column():
                # Text Inputs
                prompt = gr.Textbox(label="Prompt",
                                    placeholder="Describe the image you want to generate...")
                uncond_prompt = gr.Textbox(label="Negative Prompt (Optional)",
                                           placeholder="Describe what you don't want in the image...")

                # Generation Parameters
                with gr.Accordion("Advanced Settings", open=False):
                    do_cfg = gr.Checkbox(label="Use Classifier-Free Guidance", value=True)
                    cfg_scale = gr.Slider(minimum=1, maximum=14, value=8, label="CFG Scale")
                    sampler = gr.Dropdown(
                        choices=["ddpm", "ddim", "pndm"],  # Add more samplers if available
                        value="ddpm",
                        label="Sampling Method"
                    )
                    num_inference_steps = gr.Slider(
                        minimum=10,
                        maximum=100,
                        value=50,
                        label="Number of Inference Steps"
                    )
                    seed = gr.Number(value=42, label="Random Seed")

                # Image-to-Image Section
                with gr.Accordion("Image-to-Image", open=False):
                    input_image = gr.Image(type="pil", label="Input Image (Optional)")
                    strength = gr.Slider(
                        minimum=0,
                        maximum=1,
                        value=0.8,
                        label="Image Transformation Strength"
                    )

                # Generate Button
                generate_btn = gr.Button("Generate Image", variant="primary")

        with gr.Row():
            # Output Image
            output_image = gr.Image(label="Generated Image")

        # Connect Button to Generation Function
        generate_btn.click(
            fn=generate_image,
            inputs=[
                prompt, uncond_prompt, do_cfg, cfg_scale,
                sampler, num_inference_steps, seed,
                input_image, strength
            ],
            outputs=output_image
        )

    # Launch the interface
    demo.launch(server_name="0.0.0.0", server_port=7860)

if __name__ == "__main__":
    launch_gradio_interface()
merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
model.py ADDED
@@ -0,0 +1,28 @@
from clip import CLIP
from encoder import VAE_Encoder
from decoder import VAE_Decoder
from diffusion import Diffusion

import model_converter

def preload_models_from_standard_weights(ckpt_path, device):
    state_dict = model_converter.load_from_standard_weights(ckpt_path, device)

    encoder = VAE_Encoder().to(device)
    encoder.load_state_dict(state_dict['encoder'], strict=True)

    decoder = VAE_Decoder().to(device)
    decoder.load_state_dict(state_dict['decoder'], strict=True)

    diffusion = Diffusion().to(device)
    diffusion.load_state_dict(state_dict['diffusion'], strict=True)

    clip = CLIP().to(device)
    clip.load_state_dict(state_dict['clip'], strict=True)

    return {
        'clip': clip,
        'encoder': encoder,
        'decoder': decoder,
        'diffusion': diffusion,
    }
model_converter.py ADDED
The diff for this file is too large to render. See raw diff
 
pipeline.py ADDED
@@ -0,0 +1,174 @@
import torch
import numpy as np
from tqdm import tqdm
from ddpm import DDPMSampler

WIDTH = 512
HEIGHT = 512
LATENTS_WIDTH = WIDTH // 8
LATENTS_HEIGHT = HEIGHT // 8

def generate(
    prompt,
    uncond_prompt=None,
    input_image=None,
    strength=0.8,
    do_cfg=True,
    cfg_scale=7.5,
    sampler_name="ddpm",
    n_inference_steps=50,
    models={},
    seed=None,
    device=None,
    idle_device=None,
    tokenizer=None,
):
    with torch.no_grad():
        if not 0 < strength <= 1:
            raise ValueError("strength must be between 0 and 1")

        if idle_device:
            to_idle = lambda x: x.to(idle_device)
        else:
            to_idle = lambda x: x

        # Initialize random number generator according to the seed specified
        generator = torch.Generator(device=device)
        if seed is None:
            generator.seed()
        else:
            generator.manual_seed(seed)

        clip = models["clip"]
        clip.to(device)

        if do_cfg:
            # Convert into a list of length Seq_Len=77
            cond_tokens = tokenizer.batch_encode_plus(
                [prompt], padding="max_length", max_length=77
            ).input_ids
            # (Batch_Size, Seq_Len)
            cond_tokens = torch.tensor(cond_tokens, dtype=torch.long, device=device)
            # (Batch_Size, Seq_Len) -> (Batch_Size, Seq_Len, Dim)
            cond_context = clip(cond_tokens)
            # Convert into a list of length Seq_Len=77
            uncond_tokens = tokenizer.batch_encode_plus(
                [uncond_prompt], padding="max_length", max_length=77
            ).input_ids
            # (Batch_Size, Seq_Len)
            uncond_tokens = torch.tensor(uncond_tokens, dtype=torch.long, device=device)
            # (Batch_Size, Seq_Len) -> (Batch_Size, Seq_Len, Dim)
            uncond_context = clip(uncond_tokens)
            # (Batch_Size, Seq_Len, Dim) + (Batch_Size, Seq_Len, Dim) -> (2 * Batch_Size, Seq_Len, Dim)
            context = torch.cat([cond_context, uncond_context])
        else:
            # Convert into a list of length Seq_Len=77
            tokens = tokenizer.batch_encode_plus(
                [prompt], padding="max_length", max_length=77
            ).input_ids
            # (Batch_Size, Seq_Len)
            tokens = torch.tensor(tokens, dtype=torch.long, device=device)
            # (Batch_Size, Seq_Len) -> (Batch_Size, Seq_Len, Dim)
            context = clip(tokens)
        to_idle(clip)

        if sampler_name == "ddpm":
            sampler = DDPMSampler(generator)
            sampler.set_inference_timesteps(n_inference_steps)
        else:
            raise ValueError(f"Unknown sampler value {sampler_name}.")

        latents_shape = (1, 4, LATENTS_HEIGHT, LATENTS_WIDTH)

        if input_image:
            encoder = models["encoder"]
            encoder.to(device)

            input_image_tensor = input_image.resize((WIDTH, HEIGHT))

            # (Height, Width, Channel)
            input_image_tensor = np.array(input_image_tensor)

            # (Height, Width, Channel) -> (Height, Width, Channel)
            input_image_tensor = torch.tensor(input_image_tensor, dtype=torch.float32, device=device)
            input_image_tensor = rescale(input_image_tensor, (0, 255), (-1, 1))

            # (Height, Width, Channel) -> (Batch_Size, Height, Width, Channel)
            input_image_tensor = input_image_tensor.unsqueeze(0)

            # (Batch_Size, Height, Width, Channel) -> (Batch_Size, Channel, Height, Width)
            input_image_tensor = input_image_tensor.permute(0, 3, 1, 2)

            # (Batch_Size, 4, Latents_Height, Latents_Width)
            encoder_noise = torch.randn(latents_shape, generator=generator, device=device)
            latents = encoder(input_image_tensor, encoder_noise)

            # Add noise to the latents (the encoded input image)
            # (Batch_Size, 4, Latents_Height, Latents_Width)
            sampler.set_strength(strength=strength)
            latents = sampler.add_noise(latents, sampler.timesteps[0])
            to_idle(encoder)
        else:
            # (Batch_Size, 4, Latents_Height, Latents_Width)
            latents = torch.randn(latents_shape, generator=generator, device=device)

        diffusion = models["diffusion"]
        diffusion.to(device)

        timesteps = tqdm(sampler.timesteps)
        for i, timestep in enumerate(timesteps):
            # (1, 320)
            time_embedding = get_time_embedding(timestep).to(device)

            # (Batch_Size, 4, Latents_Height, Latents_Width)
            model_input = latents

            if do_cfg:
                # (Batch_Size, 4, Latents_Height, Latents_Width) -> (2 * Batch_Size, 4, Latents_Height, Latents_Width)
                model_input = model_input.repeat(2, 1, 1, 1)

            # model_output is the predicted noise
            # (Batch_Size, 4, Latents_Height, Latents_Width) -> (Batch_Size, 4, Latents_Height, Latents_Width)
            model_output = diffusion(model_input, context, time_embedding)

            if do_cfg:
                output_cond, output_uncond = model_output.chunk(2)
                model_output = cfg_scale * (output_cond - output_uncond) + output_uncond

            # (Batch_Size, 4, Latents_Height, Latents_Width) -> (Batch_Size, 4, Latents_Height, Latents_Width)
            latents = sampler.step(timestep, latents, model_output)

        to_idle(diffusion)

        decoder = models["decoder"]
        decoder.to(device)
        # (Batch_Size, 4, Latents_Height, Latents_Width) -> (Batch_Size, 3, Height, Width)
        images = decoder(latents)

        to_idle(decoder)

        images = rescale(images, (-1, 1), (0, 255), clamp=True)
        # (Batch_Size, Channel, Height, Width) -> (Batch_Size, Height, Width, Channel)
        images = images.permute(0, 2, 3, 1)
        images = images.to("cpu", torch.uint8).numpy()
        return images[0]

def rescale(x, old_range, new_range, clamp=False):
    old_min, old_max = old_range
    new_min, new_max = new_range
    x -= old_min
    x *= (new_max - new_min) / (old_max - old_min)
    x += new_min
    if clamp:
        x = x.clamp(new_min, new_max)
    return x

def get_time_embedding(timestep):
    # Shape: (160,)
    freqs = torch.pow(10000, -torch.arange(start=0, end=160, dtype=torch.float32) / 160)
    # Shape: (1, 160)
    x = torch.tensor([timestep], dtype=torch.float32)[:, None] * freqs[None]
    # Shape: (1, 160 * 2)
    return torch.cat([torch.cos(x), torch.sin(x)], dim=-1)
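The two helpers at the bottom are easy to sanity-check on their own: get_time_embedding turns a scalar timestep into the (1, 320) sinusoidal vector that Diffusion's TimeEmbedding expects, and rescale maps value ranges back and forth (a small illustrative sketch):

import torch
from pipeline import get_time_embedding, rescale

emb = get_time_embedding(980)
print(emb.shape)                                            # torch.Size([1, 320])

# Pixel range to model range and back
print(rescale(torch.tensor([0.0, 127.5, 255.0]), (0, 255), (-1, 1)))   # maps to [-1, 0, 1]
print(rescale(torch.tensor([-1.0, 0.0, 1.0]), (-1, 1), (0, 255)))      # maps to [0, 127.5, 255]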
requirements.txt ADDED
@@ -0,0 +1,115 @@
aiohappyeyeballs==2.6.1
aiohttp==3.11.18
aiosignal==1.3.2
annotated-types==0.7.0
asttokens @ file:///home/conda/feedstock_root/build_artifacts/asttokens_1733250440834/work
attrs==25.3.0
certifi==2025.4.26
charset-normalizer==3.4.2
click==8.2.0
comm @ file:///home/conda/feedstock_root/build_artifacts/comm_1733502965406/work
contourpy==1.3.2
cycler==0.12.1
datasets==3.6.0
debugpy @ file:///home/conda/feedstock_root/build_artifacts/debugpy_1744321233760/work
decorator @ file:///home/conda/feedstock_root/build_artifacts/decorator_1740384970518/work
dill==0.3.8
docker-pycreds==0.4.0
exceptiongroup @ file:///home/conda/feedstock_root/build_artifacts/exceptiongroup_1746947292760/work
executing @ file:///home/conda/feedstock_root/build_artifacts/executing_1745502089858/work
filelock==3.18.0
fonttools==4.58.0
frozenlist==1.6.0
fsspec==2025.3.0
gitdb==4.0.12
GitPython==3.1.44
hf-xet==1.1.0
huggingface-hub==0.31.1
idna==3.10
importlib_metadata @ file:///home/conda/feedstock_root/build_artifacts/importlib-metadata_1737420181517/work
ipykernel @ file:///home/conda/feedstock_root/build_artifacts/ipykernel_1719845459717/work
ipython @ file:///home/conda/feedstock_root/build_artifacts/bld/rattler-build_ipython_1745672166/work
ipython_pygments_lexers @ file:///home/conda/feedstock_root/build_artifacts/ipython_pygments_lexers_1737123620466/work
ipywidgets==8.1.7
jedi @ file:///home/conda/feedstock_root/build_artifacts/jedi_1733300866624/work
Jinja2==3.1.6
jupyter_client @ file:///home/conda/feedstock_root/build_artifacts/jupyter_client_1733440914442/work
jupyter_core @ file:///home/conda/feedstock_root/build_artifacts/jupyter_core_1727163409502/work
jupyterlab_widgets==3.0.15
kiwisolver==1.4.8
lightning==2.5.1.post0
lightning-utilities==0.14.3
MarkupSafe==3.0.2
matplotlib==3.10.3
matplotlib-inline @ file:///home/conda/feedstock_root/build_artifacts/matplotlib-inline_1733416936468/work
mpmath==1.3.0
multidict==6.4.3
multiprocess==0.70.16
nest_asyncio @ file:///home/conda/feedstock_root/build_artifacts/nest-asyncio_1733325553580/work
networkx==3.4.2
numpy==2.2.5
nvidia-cublas-cu12==12.6.4.1
nvidia-cuda-cupti-cu12==12.6.80
nvidia-cuda-nvrtc-cu12==12.6.77
nvidia-cuda-runtime-cu12==12.6.77
nvidia-cudnn-cu12==9.5.1.17
nvidia-cufft-cu12==11.3.0.4
nvidia-cufile-cu12==1.11.1.6
nvidia-curand-cu12==10.3.7.77
nvidia-cusolver-cu12==11.7.1.2
nvidia-cusparse-cu12==12.5.4.2
nvidia-cusparselt-cu12==0.6.3
nvidia-nccl-cu12==2.26.2
nvidia-nvjitlink-cu12==12.6.85
nvidia-nvtx-cu12==12.6.77
packaging==24.2
pandas==2.2.3
parso @ file:///home/conda/feedstock_root/build_artifacts/parso_1733271261340/work
pexpect @ file:///home/conda/feedstock_root/build_artifacts/pexpect_1733301927746/work
pickleshare @ file:///home/conda/feedstock_root/build_artifacts/pickleshare_1733327343728/work
pillow==11.2.1
platformdirs @ file:///home/conda/feedstock_root/build_artifacts/bld/rattler-build_platformdirs_1746710438/work
prompt_toolkit @ file:///home/conda/feedstock_root/build_artifacts/prompt-toolkit_1744724089886/work
propcache==0.3.1
protobuf==6.30.2
psutil @ file:///home/conda/feedstock_root/build_artifacts/psutil_1740663149797/work
ptyprocess @ file:///home/conda/feedstock_root/build_artifacts/ptyprocess_1733302279685/work/dist/ptyprocess-0.7.0-py2.py3-none-any.whl#sha256=92c32ff62b5fd8cf325bec5ab90d7be3d2a8ca8c8a3813ff487a8d2002630d1f
pure_eval @ file:///home/conda/feedstock_root/build_artifacts/pure_eval_1733569405015/work
pyarrow==20.0.0
pydantic==2.11.4
pydantic_core==2.33.2
Pygments @ file:///home/conda/feedstock_root/build_artifacts/pygments_1736243443484/work
pyparsing==3.2.3
python-dateutil @ file:///home/conda/feedstock_root/build_artifacts/python-dateutil_1733215673016/work
pytorch-lightning==2.5.1.post0
pytz==2025.2
PyYAML==6.0.2
pyzmq @ file:///home/conda/feedstock_root/build_artifacts/pyzmq_1743831245578/work
regex==2024.11.6
requests==2.32.3
safetensors==0.5.3
sentry-sdk==2.27.0
setproctitle==1.3.6
six @ file:///home/conda/feedstock_root/build_artifacts/six_1733380938961/work
smmap==5.0.2
stack_data @ file:///home/conda/feedstock_root/build_artifacts/stack_data_1733569443808/work
sympy==1.14.0
tokenizers==0.21.1
torch==2.7.0
torchmetrics==1.7.1
torchvision==0.22.0
tornado @ file:///home/conda/feedstock_root/build_artifacts/tornado_1732615904614/work
tqdm==4.67.1
traitlets @ file:///home/conda/feedstock_root/build_artifacts/traitlets_1733367359838/work
transformers==4.51.3
triton==3.3.0
typing-inspection==0.4.0
typing_extensions @ file:///home/conda/feedstock_root/build_artifacts/bld/rattler-build_typing_extensions_1744302253/work
tzdata==2025.2
urllib3==2.4.0
wandb==0.19.11
wcwidth @ file:///home/conda/feedstock_root/build_artifacts/wcwidth_1733231326287/work
widgetsnbextension==4.0.14
xxhash==3.5.0
yarl==1.20.0
zipp @ file:///home/conda/feedstock_root/build_artifacts/zipp_1732827521216/work
test.ipynb ADDED
The diff for this file is too large to render. See raw diff
 
vocab.json ADDED
The diff for this file is too large to render. See raw diff