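"""Gradio demo for ECLIPSE (CVPR 2024): an efficient text-to-image prior
paired with the Kandinsky 2.2 decoder for text-to-image generation."""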
import gradio as gr
import torch
from transformers import CLIPTextModelWithProjection, CLIPTokenizer
from diffusers import DiffusionPipeline
import spaces  # Hugging Face Spaces (ZeroGPU) runtime support

# Original Hugging Face prior transformer without time conditioning.
from src.priors.prior_transformer import PriorTransformer
from src.pipelines.pipeline_kandinsky_prior import KandinskyPriorPipeline

# Run on GPU when available, otherwise fall back to CPU.
__DEVICE__ = "cuda" if torch.cuda.is_available() else "cpu"
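

# Model wrapper: a frozen CLIP bigG text encoder feeds the ECLIPSE prior
# (text embedding -> image embedding), whose output conditions the
# Kandinsky 2.2 diffusion decoder.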
class Ours:
    def __init__(self, device):
        # Frozen CLIP text encoder; the projection dim matches the Kandinsky image-embedding size.
        text_encoder = (
            CLIPTextModelWithProjection.from_pretrained(
                "laion/CLIP-ViT-bigG-14-laion2B-39B-b160k",
                projection_dim=1280,
                torch_dtype=torch.float16,
            )
            .eval()
            .requires_grad_(False)
        )
        tokenizer = CLIPTokenizer.from_pretrained(
            "laion/CLIP-ViT-bigG-14-laion2B-39B-b160k"
        )

        # ECLIPSE prior checkpoint (no diffusion-time conditioning).
        prior = PriorTransformer.from_pretrained(
            "ECLIPSE-Community/ECLIPSE_KandinskyV22_Prior",
            torch_dtype=torch.float16,
        )

        self.pipe_prior = KandinskyPriorPipeline.from_pretrained(
            "kandinsky-community/kandinsky-2-2-prior",
            prior=prior,
            text_encoder=text_encoder,
            tokenizer=tokenizer,
            torch_dtype=torch.float16,
        ).to(device)

        self.pipe = DiffusionPipeline.from_pretrained(
            "kandinsky-community/kandinsky-2-2-decoder", torch_dtype=torch.float16
        ).to(device)

    def inference(self, text, negative_text, steps, guidance_scale, width, height):
        # Generate two independent samples for the gallery.
        gen_images = []
        for _ in range(2):
            image_emb, negative_image_emb = self.pipe_prior(
                text, negative_prompt=negative_text
            ).to_tuple()
            image = self.pipe(
                image_embeds=image_emb,
                negative_image_embeds=negative_image_emb,
                num_inference_steps=steps,
                guidance_scale=guidance_scale,
                width=width,
                height=height,
            ).images
            gen_images.append(image[0])
        return gen_images
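

# Load the pipelines once at startup so the pretrained weights are ready
# before the UI launches.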
selected_model = Ours(device=__DEVICE__)


def get_images(text, negative_text, steps, guidance_scale, width, height, fixed_res):
    # A preset resolution overrides the manual width/height textboxes.
    if fixed_res != "manual":
        print(f"Using {fixed_res} resolution")
        width, height = fixed_res.split("x")
    images = selected_model.inference(
        text,
        negative_text,
        steps,
        guidance_scale,
        width=int(width),
        height=int(height),
    )
    return images
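

# Gradio UI: prompt and negative-prompt inputs, sampling controls,
# resolution options, and a two-image output gallery.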
with gr.Blocks() as demo:
    gr.Markdown(
        """<h1 style="text-align: center;"><b>[CVPR 2024] <i>ECLIPSE</i>: Revisiting the Text-to-Image Prior for Efficient Image Generation</b></h1>
        <h1 style='text-align: center;'><a href='https://eclipse-t2i.vercel.app/'>Project Page</a> | <a href='https://arxiv.org/abs/2312.04655'>Paper</a> </h1>
        """
    )
    with gr.Group():
        with gr.Row():
            with gr.Column():
                text = gr.Textbox(
                    label="Enter your prompt",
                    show_label=False,
                    max_lines=1,
                    placeholder="Enter your prompt",
                    elem_id="prompt-text-input",
                )
        with gr.Row():
            with gr.Column():
                negative_text = gr.Textbox(
                    label="Enter your negative prompt",
                    show_label=False,
                    max_lines=1,
                    placeholder="Enter your negative prompt",
                    elem_id="negative-prompt-text-input",
                )
        with gr.Row():
            steps = gr.Slider(label="Steps", minimum=10, maximum=100, value=50, step=1)
            guidance_scale = gr.Slider(
                label="Guidance Scale", minimum=0, maximum=10, value=7.5, step=0.1
            )
        with gr.Row():
            with gr.Group():
                width_inp = gr.Textbox(
                    label="Please provide the width",
                    value="512",
                    max_lines=1,
                )
                height_inp = gr.Textbox(
                    label="Please provide the height",
                    max_lines=1,
                    value="512",
                )
                fixed_res = gr.Dropdown(
                    ["manual", "512x512", "1024x1024", "1920x1080", "1280x720"],
                    value="manual",
                    label="Predefined Resolution",
                    info="Either select one or manually define one!",
                )
        with gr.Row():
            btn = gr.Button(value="Generate Image")

    gallery = gr.Gallery(
        label="Generated images",
        show_label=False,
        elem_id="gallery",
        columns=[2],
        rows=[1],
        object_fit="contain",
        height="auto",
    )
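
    # Wire the Generate button and both prompt textboxes' submit events to the same handler.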
    generation_inputs = [
        text,
        negative_text,
        steps,
        guidance_scale,
        width_inp,
        height_inp,
        fixed_res,
    ]
    btn.click(get_images, inputs=generation_inputs, outputs=gallery)
    text.submit(get_images, inputs=generation_inputs, outputs=gallery)
    negative_text.submit(get_images, inputs=generation_inputs, outputs=gallery)

    with gr.Accordion(label="Ethics & Privacy", open=False):
        gr.HTML(
            """<div class="acknowledgments">
            <p><h4>Privacy</h4>
            We do not collect any images or key data. This demo is intended purely for fun and to reduce the misuse of AI.
            <p><h4>Biases and content acknowledgment</h4>
            This model will have the same biases as the pre-trained CLIP model.</div>
            """
        )


if __name__ == "__main__":
    demo.queue(max_size=20).launch()