Spaces:

pOpsPaper
/

pOps-space

Running on Zero

App Files Files Community

pOpsPaper committed on Jun 6, 2024

Commit

71d3bec

1 Parent(s): 5fdb8b2

Added space

Browse files

Files changed (14) hide show

app.py +101 -0
inputs/alexandra-zelena-phskyemu_c4-unsplash.jpg +0 -0
inputs/birmingham-museums-trust-q2OwlfXAYfo-unsplash.jpg +0 -0
inputs/engin-akyurt-aXVro7lQyUM-unsplash.jpg +0 -0
inputs/george-webster-p1VZ5IbT2Tg-unsplash.jpg +0 -0
inputs/hannah-pemberton-3d82e5_ylGo-unsplash.jpg +0 -0
inputs/mihaly-varga-AQFfdEY3X4Q-unsplash.jpg +0 -0
inputs/r-n-tyfqOL1FAQc-unsplash.jpg +0 -0
model/__init__.py +0 -0
model/pipeline_pops.py +553 -0
model/pops_utils.py +41 -0
pops.py +223 -0
requirements.txt +6 -0
style.css +55 -0

app.py ADDED Viewed

	@@ -0,0 +1,101 @@

+import gradio as gr
+from pops import PopsPipelines
+# Passed as `width` to every image component and as `min_width` to the column that holds it.
+BLOCK_WIDTH = 250
+# Passed as `height` to every image component; also used as the min-height (px) of the operator glyph cells.
+BLOCK_HEIGHT = 270
+# Font size (rem) of the operator glyphs ("O" and "=") rendered between the image columns.
+FONT_SIZE = 3.5
+# Single module-level pipelines object; both `run_equation_*` callbacks delegate to it.
+pops_pipelines = PopsPipelines()
+def run_equation_1(object_path, text, texture_path):
+    image = pops_pipelines.run_instruct_texture(object_path, text, texture_path)
+    return image
+def run_equation_2(object_path, texture_path, scene_path):
+    image = pops_pipelines.run_texture_scene(object_path, texture_path, scene_path)
+    return image
+with gr.Blocks(css='style.css') as demo:
+    gr.HTML('''<h1>p<span class="o-pops">O</span>ps: Photo-Inspired Diffusion <span class="o-operators">O</span>perators</h1>''')
+    gr.HTML('<div style="text-align: center;"><h3><a href="https://popspaper.github.io/pOps/">https://popspaper.github.io/pOps/</a></h3></div>')
+    gr.HTML(
+        '<div style="text-align: center;">Our method learns operators that are applied directly in the image embedding space, resulting in a variety of semantic operations that can then be realized as images using an image diffusion model.</div>')
+    with gr.Row(equal_height=True,elem_classes='justified-element'):
+        with gr.Column(scale=0,min_width=BLOCK_WIDTH):
+            object_path_eq_1 = gr.Image(label="Upload object image", type="filepath",width=BLOCK_WIDTH,height=BLOCK_HEIGHT)
+        with gr.Column(scale=0,min_width=50):
+            gr.HTML(f'''<div  style="justify-content: center; align-items: center;min-height:{BLOCK_HEIGHT}px"><span class="vertical-center" style="color:#82cf8e;font-size:{FONT_SIZE}rem;font-family:'Google Sans', sans-serif";>O</span></div>''')
+        with gr.Column(scale=0,min_width=200):
+            with gr.Group(elem_classes='instruct'):
+                text_eq_1 = gr.Textbox(value="",label="Enter adjective",max_lines=1,placeholder='e.g. melting, shiny, spiky',elem_classes='vertical-center')
+        with gr.Column(scale=0,min_width=50):
+            gr.HTML(f'''<div  style="justify-content: center; align-items: center;min-height:{BLOCK_HEIGHT}px"><span class="vertical-center" style="color:#efa241;font-size:{FONT_SIZE}rem;font-family:'Google Sans', sans-serif";>O</span></div>''')
+        with gr.Column(scale=0,min_width=BLOCK_WIDTH):
+            texture_path_eq_1 = gr.Image(label="Upload texture image", type="filepath",width=BLOCK_WIDTH,height=BLOCK_HEIGHT)
+        with gr.Column(scale=0,min_width=50):
+            gr.HTML(f'''<div  style="justify-content: center; align-items: center;min-height:{BLOCK_HEIGHT}px"><span class="vertical-center" style="color:#efa241;font-size:{FONT_SIZE}rem;font-family:'Google Sans', sans-serif";>=</span></div>''')
+        with gr.Column(scale=0,min_width=BLOCK_WIDTH):
+            output_eq_1 = gr.Image(label="Output",width=BLOCK_WIDTH,height=BLOCK_HEIGHT)
+    with gr.Row(equal_height=True, elem_classes='justified-element'):
+        run_button_eq_1 = gr.Button("Run Instruct and Texture Equation",elem_classes='small-elem')
+        run_button_eq_1.click(fn=run_equation_1,inputs=[object_path_eq_1, text_eq_1, texture_path_eq_1],outputs=[output_eq_1])
+    with gr.Row(equal_height=True, elem_classes='justified-element'):
+        pass
+    with gr.Row(equal_height=True,elem_classes='justified-element'):
+        with gr.Column(scale=0,min_width=BLOCK_WIDTH):
+            object_path_eq_2 = gr.Image(label="Upload object image", type="filepath",width=BLOCK_WIDTH,height=BLOCK_HEIGHT)
+        with gr.Column(scale=0,min_width=50):
+            gr.HTML(f'''<div  style="justify-content: center; align-items: center;min-height:{BLOCK_HEIGHT}px"><span class="vertical-center" style="color:#efa241;font-size:{FONT_SIZE}rem;font-family:'Google Sans', sans-serif";>O</span></div>''')
+        with gr.Column(scale=0,min_width=BLOCK_WIDTH):
+            texture_path_eq_2 = gr.Image(label="Upload texture image", type="filepath",width=BLOCK_WIDTH,height=BLOCK_HEIGHT)
+            # texture_path = gr.Image(label="Upload texture image", type="filepath",width=BLOCK_WIDTH,height=BLOCK_HEIGHT)
+        with gr.Column(scale=0,min_width=50):
+            gr.HTML(f'''<div  style="justify-content: center; align-items: center;min-height:{BLOCK_HEIGHT}px"><span class="vertical-center" style="color:#A085FF;font-size:{FONT_SIZE}rem;font-family:'Google Sans', sans-serif";>O</span></div>''')
+        with gr.Column(scale=0,min_width=BLOCK_WIDTH):
+            scene_path_eq_2 = gr.Image(label="Upload scene image", type="filepath",width=BLOCK_WIDTH,height=BLOCK_HEIGHT)
+        with gr.Column(scale=0,min_width=50):
+            gr.HTML(f'''<div  style="justify-content: center; align-items: center;min-height:{BLOCK_HEIGHT}px"><span class="vertical-center" style="color:#A085FF;font-size:{FONT_SIZE}rem;font-family:'Google Sans', sans-serif";>=</span></div>''')
+        with gr.Column(scale=0,min_width=BLOCK_WIDTH):
+            output_eq_2 = gr.Image(label="Output",width=BLOCK_WIDTH,height=BLOCK_HEIGHT)
+    with gr.Row(equal_height=True, elem_classes='justified-element'):
+        run_button_eq_2 = gr.Button("Run Texture and Scene Equation",elem_classes='small-elem')
+        run_button_eq_2.click(fn=run_equation_2,inputs=[object_path_eq_2, texture_path_eq_2, scene_path_eq_2],outputs=[output_eq_2])
+    with gr.Row(equal_height=True, elem_classes='justified-element'):
+        with gr.Column(scale=1):
+            examples = [
+                ['inputs/birmingham-museums-trust-q2OwlfXAYfo-unsplash.jpg', 'enormous',
+                 'inputs/mihaly-varga-AQFfdEY3X4Q-unsplash.jpg'],
+                ['inputs/r-n-tyfqOL1FAQc-unsplash.jpg', 'group', 'inputs/george-webster-p1VZ5IbT2Tg-unsplash.jpg'],
+            ]
+            gr.Examples(examples=examples,
+                        inputs=[object_path_eq_1, text_eq_1, texture_path_eq_1],
+                        outputs=[output_eq_1],
+                        fn=run_equation_1,
+                        cache_examples=False)
+            examples_2 = [
+                ['inputs/hannah-pemberton-3d82e5_ylGo-unsplash.jpg', 'inputs/engin-akyurt-aXVro7lQyUM-unsplash.jpg', 'inputs/alexandra-zelena-phskyemu_c4-unsplash.jpg'],
+            ]
+            gr.Examples(examples=examples_2,
+                        inputs=[object_path_eq_2, texture_path_eq_2, scene_path_eq_2],
+                        outputs=[output_eq_2],
+                        fn=run_equation_2,
+                        cache_examples=False)
+        with gr.Column(scale=1):
+            gr.HTML('''
+                    <div class="column">
+                    <h2 class="">🎶  Learn More  🎶</h2>
+                    <div class="">
+                              <div height="100%">
+                                <video src="https://github.com/pOpsPaper/pOps/raw/gh-pages/static/figures/teaser_video.mp4" controls ></video>
+                            </div>
+                    </div>
+                    <div class=""><small>
+                      Audio track for the teaser video was generated with the help of <a href="https://suno.com/">suno</a>.
+                    </small>
+                    </div>
+            ''')
+demo.queue().launch()

inputs/alexandra-zelena-phskyemu_c4-unsplash.jpg ADDED Viewed

inputs/birmingham-museums-trust-q2OwlfXAYfo-unsplash.jpg ADDED Viewed

inputs/engin-akyurt-aXVro7lQyUM-unsplash.jpg ADDED Viewed

inputs/george-webster-p1VZ5IbT2Tg-unsplash.jpg ADDED Viewed

inputs/hannah-pemberton-3d82e5_ylGo-unsplash.jpg ADDED Viewed

inputs/mihaly-varga-AQFfdEY3X4Q-unsplash.jpg ADDED Viewed

inputs/r-n-tyfqOL1FAQc-unsplash.jpg ADDED Viewed

model/__init__.py ADDED Viewed

File without changes

model/pipeline_pops.py ADDED Viewed

	@@ -0,0 +1,553 @@

+from typing import List, Optional, Union
+import PIL
+import torch
+from transformers import CLIPImageProcessor, CLIPTextModelWithProjection, CLIPTokenizer, CLIPVisionModelWithProjection
+from diffusers.models import PriorTransformer
+from diffusers.schedulers import UnCLIPScheduler
+from diffusers.utils import (
+    is_accelerate_available,
+    is_accelerate_version,
+    logging,
+    replace_example_docstring,
+)
+from diffusers.utils.torch_utils import randn_tensor
+from diffusers.pipelines.kandinsky import KandinskyPriorPipelineOutput
+from diffusers.pipelines.pipeline_utils import DiffusionPipeline
+logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
+# Example text passed to `replace_example_docstring` on `pOpsPipeline.__call__`.
+# NOTE(review): the snippet calls the Kandinsky 2.2 prior with a text prompt, whereas `pOpsPipeline.__call__`
+# in this file takes `input_embeds` / `input_hidden_states`. It looks inherited from the Kandinsky prior
+# pipeline -- confirm whether a pOps-specific example should replace it.
+EXAMPLE_DOC_STRING = """
+    Examples:
+        ```py
+        >>> from diffusers import KandinskyV22Pipeline, KandinskyV22PriorPipeline
+        >>> import torch
+        >>> pipe_prior = KandinskyV22PriorPipeline.from_pretrained("kandinsky-community/kandinsky-2-2-prior")
+        >>> pipe_prior.to("cuda")
+        >>> prompt = "red cat, 4k photo"
+        >>> image_emb, negative_image_emb = pipe_prior(prompt).to_tuple()
+        >>> pipe = KandinskyV22Pipeline.from_pretrained("kandinsky-community/kandinsky-2-2-decoder")
+        >>> pipe.to("cuda")
+        >>> image = pipe(
+        ...     image_embeds=image_emb,
+        ...     negative_image_embeds=negative_image_emb,
+        ...     height=768,
+        ...     width=768,
+        ...     num_inference_steps=50,
+        ... ).images
+        >>> image[0].save("cat.png")
+        ```
+"""
+# Example text passed to `replace_example_docstring` on `pOpsPipeline.interpolate`.
+# NOTE(review): also written against `KandinskyV22PriorPipeline`; verify it still applies to this class.
+EXAMPLE_INTERPOLATE_DOC_STRING = """
+    Examples:
+        ```py
+        >>> from diffusers import KandinskyV22PriorPipeline, KandinskyV22Pipeline
+        >>> from diffusers.utils import load_image
+        >>> import PIL
+        >>> import torch
+        >>> from torchvision import transforms
+        >>> pipe_prior = KandinskyV22PriorPipeline.from_pretrained(
+        ...     "kandinsky-community/kandinsky-2-2-prior", torch_dtype=torch.float16
+        ... )
+        >>> pipe_prior.to("cuda")
+        >>> img1 = load_image(
+        ...     "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main"
+        ...     "/kandinsky/cat.png"
+        ... )
+        >>> img2 = load_image(
+        ...     "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main"
+        ...     "/kandinsky/starry_night.jpeg"
+        ... )
+        >>> images_texts = ["a cat", img1, img2]
+        >>> weights = [0.3, 0.3, 0.4]
+        >>> out = pipe_prior.interpolate(images_texts, weights)
+        >>> pipe = KandinskyV22Pipeline.from_pretrained(
+        ...     "kandinsky-community/kandinsky-2-2-decoder", torch_dtype=torch.float16
+        ... )
+        >>> pipe.to("cuda")
+        >>> image = pipe(
+        ...     image_embeds=out.image_embeds,
+        ...     negative_image_embeds=out.negative_image_embeds,
+        ...     height=768,
+        ...     width=768,
+        ...     num_inference_steps=50,
+        ... ).images[0]
+        >>> image.save("starry_cat.png")
+        ```
+"""
+class pOpsPipeline(DiffusionPipeline):
+    """
+    Pipeline for generating image prior for Kandinsky
+    This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the
+    library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.)
+    Args:
+        prior ([`PriorTransformer`]):
+            The canonical unCLIP prior to approximate the image embedding from the text embedding.
+        image_encoder ([`CLIPVisionModelWithProjection`]):
+            Frozen image-encoder.
+        text_encoder ([`CLIPTextModelWithProjection`]):
+            Frozen text-encoder.
+        tokenizer (`CLIPTokenizer`):
+            Tokenizer of class
+            [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer).
+        scheduler ([`UnCLIPScheduler`]):
+            A scheduler to be used in combination with `prior` to generate image embedding.
+        image_processor ([`CLIPImageProcessor`]):
+            A image_processor to be used to preprocess image from clip.
+    """
+    _exclude_from_cpu_offload = ["prior"]
+    def __init__(
+        self,
+        prior: PriorTransformer,
+        image_encoder: CLIPVisionModelWithProjection,
+        text_encoder: CLIPTextModelWithProjection,
+        tokenizer: CLIPTokenizer,
+        scheduler: UnCLIPScheduler,
+        image_processor: CLIPImageProcessor,
+    ):
+        super().__init__()
+        self.register_modules(
+            prior=prior,
+            text_encoder=text_encoder,
+            tokenizer=tokenizer,
+            scheduler=scheduler,
+            image_encoder=image_encoder,
+            image_processor=image_processor,
+        )
+    @torch.no_grad()
+    @replace_example_docstring(EXAMPLE_INTERPOLATE_DOC_STRING)
+    def interpolate(
+        self,
+        images_and_prompts: List[Union[str, PIL.Image.Image, torch.FloatTensor]],
+        weights: List[float],
+        num_images_per_prompt: int = 1,
+        num_inference_steps: int = 25,
+        generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
+        latents: Optional[torch.FloatTensor] = None,
+        negative_prior_prompt: Optional[str] = None,
+        negative_prompt: str = "",
+        guidance_scale: float = 4.0,
+        device=None,
+    ):
+        """
+        Build a weighted sum of image embeddings from a list of prompts and/or images.
+        Args:
+            images_and_prompts (`List[Union[str, PIL.Image.Image, torch.FloatTensor]]`):
+                list of prompts and images to guide the image generation.
+            weights: (`List[float]`):
+                list of weights for each condition in `images_and_prompts`
+            num_images_per_prompt (`int`, *optional*, defaults to 1):
+                The number of images to generate per prompt.
+            num_inference_steps (`int`, *optional*, defaults to 25):
+                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
+                expense of slower inference.
+            generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
+                One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html)
+                to make generation deterministic.
+            latents (`torch.FloatTensor`, *optional*):
+                Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
+                generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
+                tensor will be generated by sampling using the supplied random `generator`.
+            negative_prior_prompt (`str`, *optional*):
+                The prompt not to guide the prior diffusion process. Ignored when not using guidance (i.e., ignored if
+                `guidance_scale` is less than `1`).
+            negative_prompt (`str` or `List[str]`, *optional*):
+                The prompt not to guide the image generation. Ignored when not using guidance (i.e., ignored if
+                `guidance_scale` is less than `1`).
+            guidance_scale (`float`, *optional*, defaults to 4.0):
+                Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
+                `guidance_scale` is defined as `w` of equation 2. of [Imagen
+                Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
+                1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`,
+                usually at the expense of lower image quality.
+            device (*optional*):
+                Device that PIL inputs are moved to before image encoding. Falls back to `self.device`.
+        Examples:
+        Returns:
+            [`KandinskyPriorPipelineOutput`] or `tuple`
+        Raises:
+            ValueError: If `images_and_prompts` and `weights` differ in length, or if an element of
+                `images_and_prompts` is not a `str`, `PIL.Image.Image` or `torch.Tensor`.
+        """
+        device = device or self.device
+        if len(images_and_prompts) != len(weights):
+            raise ValueError(
+                f"`images_and_prompts` contains {len(images_and_prompts)} items and `weights` contains {len(weights)} items - they should be lists of same length"
+            )
+        image_embeddings = []
+        for cond, weight in zip(images_and_prompts, weights):
+            if isinstance(cond, str):
+                # NOTE(review): `__call__` as defined in this file takes `input_embeds` / `input_hidden_states`
+                # and has no `negative_prompt` parameter, so this text-prompt call looks incompatible with it
+                # (it would raise TypeError on the unexpected keyword). Confirm whether text conditions are
+                # meant to be supported here.
+                image_emb = self(
+                    cond,
+                    num_inference_steps=num_inference_steps,
+                    num_images_per_prompt=num_images_per_prompt,
+                    generator=generator,
+                    latents=latents,
+                    negative_prompt=negative_prior_prompt,
+                    guidance_scale=guidance_scale,
+                ).image_embeds.unsqueeze(0)
+            elif isinstance(cond, (PIL.Image.Image, torch.Tensor)):
+                if isinstance(cond, PIL.Image.Image):
+                    # PIL images are preprocessed into a single-item pixel batch; tensors are passed to the
+                    # image encoder as given.
+                    cond = (
+                        self.image_processor(cond, return_tensors="pt")
+                        .pixel_values[0]
+                        .unsqueeze(0)
+                        .to(dtype=self.image_encoder.dtype, device=device)
+                    )
+                image_emb = self.image_encoder(cond)["image_embeds"].repeat(num_images_per_prompt, 1).unsqueeze(0)
+            else:
+                raise ValueError(
+                    f"`images_and_prompts` can only contains elements to be of type `str`, `PIL.Image.Image` or `torch.Tensor`  but is {type(cond)}"
+                )
+            image_embeddings.append(image_emb * weight)
+        # Each entry carries a leading axis of size 1, so cat + sum over dim 0 adds the weighted embeddings.
+        image_emb = torch.cat(image_embeddings).sum(dim=0)
+        # NOTE(review): same signature mismatch as the text-prompt call above -- this call passes a prompt
+        # string and `negative_prompt=` to `__call__`.
+        out_zero = self(
+            negative_prompt,
+            num_inference_steps=num_inference_steps,
+            num_images_per_prompt=num_images_per_prompt,
+            generator=generator,
+            latents=latents,
+            negative_prompt=negative_prior_prompt,
+            guidance_scale=guidance_scale,
+        )
+        zero_image_emb = out_zero.negative_image_embeds if negative_prompt == "" else out_zero.image_embeds
+        return KandinskyPriorPipelineOutput(image_embeds=image_emb, negative_image_embeds=zero_image_emb)
+    # Copied from diffusers.pipelines.unclip.pipeline_unclip.UnCLIPPipeline.prepare_latents
+    def prepare_latents(self, shape, dtype, device, generator, latents, scheduler):
+        if latents is None:
+            latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
+        else:
+            if latents.shape != shape:
+                raise ValueError(f"Unexpected latents shape, got {latents.shape}, expected {shape}")
+            latents = latents.to(device)
+        latents = latents * scheduler.init_noise_sigma
+        return latents
+    # Copied from diffusers.pipelines.kandinsky.pipeline_kandinsky_prior.KandinskyPriorPipeline.get_zero_embed
+    def get_zero_embed(self, batch_size=1, device=None):
+        device = device or self.device
+        zero_img = torch.zeros(1, 3, self.image_encoder.config.image_size, self.image_encoder.config.image_size).to(
+            device=device, dtype=self.image_encoder.dtype
+        )
+        zero_image_emb = self.image_encoder(zero_img)["image_embeds"]
+        zero_image_emb = zero_image_emb.repeat(batch_size, 1)
+        return zero_image_emb
+    # Copied from diffusers.pipelines.kandinsky.pipeline_kandinsky_prior.KandinskyPriorPipeline._encode_prompt
+    def _encode_prompt(
+        self,
+        prompt,
+        device,
+        num_images_per_prompt,
+        do_classifier_free_guidance,
+        negative_prompt=None,
+    ):
+        """Tokenize and encode `prompt` with the text encoder, optionally adding unconditional entries.
+        Args:
+            prompt: A string or a list of strings; a bare string is treated as a batch of one.
+            device: Device the token ids and attention masks are moved to.
+            num_images_per_prompt: How many times each prompt's outputs are repeated along the batch axis.
+            do_classifier_free_guidance: When true, outputs for `negative_prompt` are concatenated in
+                front of the prompt outputs.
+            negative_prompt: Same type as `prompt`; `None` means one empty string per prompt. Only read
+                when `do_classifier_free_guidance` is true.
+        Returns:
+            A tuple `(prompt_embeds, text_encoder_hidden_states, text_mask)`.
+        Raises:
+            TypeError: If `negative_prompt` is given with a different type than `prompt`.
+            ValueError: If `negative_prompt` is a list whose length differs from the prompt batch size.
+        """
+        batch_size = len(prompt) if isinstance(prompt, list) else 1
+        # get prompt text embeddings
+        text_inputs = self.tokenizer(
+            prompt,
+            padding="max_length",
+            max_length=self.tokenizer.model_max_length,
+            truncation=True,
+            return_tensors="pt",
+        )
+        text_input_ids = text_inputs.input_ids
+        text_mask = text_inputs.attention_mask.bool().to(device)
+        # Tokenize again without truncation, only to detect and report what was cut off.
+        untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pt").input_ids
+        if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal(text_input_ids, untruncated_ids):
+            removed_text = self.tokenizer.batch_decode(untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1])
+            logger.warning(
+                "The following part of your input was truncated because CLIP can only handle sequences up to"
+                f" {self.tokenizer.model_max_length} tokens: {removed_text}"
+            )
+            text_input_ids = text_input_ids[:, : self.tokenizer.model_max_length]
+        text_encoder_output = self.text_encoder(text_input_ids.to(device))
+        prompt_embeds = text_encoder_output.text_embeds
+        text_encoder_hidden_states = text_encoder_output.last_hidden_state
+        # One copy per requested image; copies of the same prompt stay adjacent along dim 0.
+        prompt_embeds = prompt_embeds.repeat_interleave(num_images_per_prompt, dim=0)
+        text_encoder_hidden_states = text_encoder_hidden_states.repeat_interleave(num_images_per_prompt, dim=0)
+        text_mask = text_mask.repeat_interleave(num_images_per_prompt, dim=0)
+        if do_classifier_free_guidance:
+            uncond_tokens: List[str]
+            if negative_prompt is None:
+                uncond_tokens = [""] * batch_size
+            elif type(prompt) is not type(negative_prompt):
+                raise TypeError(
+                    f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !="
+                    f" {type(prompt)}."
+                )
+            elif isinstance(negative_prompt, str):
+                uncond_tokens = [negative_prompt]
+            elif batch_size != len(negative_prompt):
+                raise ValueError(
+                    f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:"
+                    f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches"
+                    " the batch size of `prompt`."
+                )
+            else:
+                uncond_tokens = negative_prompt
+            uncond_input = self.tokenizer(
+                uncond_tokens,
+                padding="max_length",
+                max_length=self.tokenizer.model_max_length,
+                truncation=True,
+                return_tensors="pt",
+            )
+            uncond_text_mask = uncond_input.attention_mask.bool().to(device)
+            negative_prompt_embeds_text_encoder_output = self.text_encoder(uncond_input.input_ids.to(device))
+            negative_prompt_embeds = negative_prompt_embeds_text_encoder_output.text_embeds
+            uncond_text_encoder_hidden_states = negative_prompt_embeds_text_encoder_output.last_hidden_state
+            # duplicate unconditional embeddings for each generation per prompt, using mps friendly method
+            seq_len = negative_prompt_embeds.shape[1]
+            negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt)
+            negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len)
+            seq_len = uncond_text_encoder_hidden_states.shape[1]
+            uncond_text_encoder_hidden_states = uncond_text_encoder_hidden_states.repeat(1, num_images_per_prompt, 1)
+            uncond_text_encoder_hidden_states = uncond_text_encoder_hidden_states.view(
+                batch_size * num_images_per_prompt, seq_len, -1
+            )
+            uncond_text_mask = uncond_text_mask.repeat_interleave(num_images_per_prompt, dim=0)
+            # done duplicates
+            # For classifier free guidance, we need to do two forward passes.
+            # Here we concatenate the unconditional and text embeddings into a single batch
+            # to avoid doing two forward passes
+            prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds])
+            text_encoder_hidden_states = torch.cat([uncond_text_encoder_hidden_states, text_encoder_hidden_states])
+            text_mask = torch.cat([uncond_text_mask, text_mask])
+        return prompt_embeds, text_encoder_hidden_states, text_mask
+    def enable_model_cpu_offload(self, gpu_id=0):
+        r"""
+        Offloads all models to CPU using accelerate, reducing memory usage with a low impact on performance. Compared
+        to `enable_sequential_cpu_offload`, this method moves one whole model at a time to the GPU when its `forward`
+        method is called, and the model remains in GPU until the next model runs. Memory savings are lower than with
+        `enable_sequential_cpu_offload`, but performance is much better due to the iterative execution of the `unet`.
+        """
+        if is_accelerate_available() and is_accelerate_version(">=", "0.17.0.dev0"):
+            from accelerate import cpu_offload_with_hook
+        else:
+            raise ImportError("`enable_model_cpu_offload` requires `accelerate v0.17.0` or higher.")
+        device = torch.device(f"cuda:{gpu_id}")
+        if self.device.type != "cpu":
+            self.to("cpu", silence_dtype_warnings=True)
+            torch.cuda.empty_cache()  # otherwise we don't see the memory savings (but they probably exist)
+        hook = None
+        for cpu_offloaded_model in [self.text_encoder, self.prior]:
+            _, hook = cpu_offload_with_hook(cpu_offloaded_model, device, prev_module_hook=hook)
+        # We'll offload the last model manually.
+        self.prior_hook = hook
+        _, hook = cpu_offload_with_hook(self.image_encoder, device, prev_module_hook=self.prior_hook)
+        self.final_offload_hook = hook
+    @torch.no_grad()
+    @replace_example_docstring(EXAMPLE_DOC_STRING)
+    def __call__(
+        self,
+        input_embeds: torch.FloatTensor,
+        input_hidden_states: torch.FloatTensor,
+        negative_input_embeds: Optional[torch.FloatTensor] = None,
+        negative_input_hidden_states: Optional[torch.FloatTensor] = None,
+        input_mask: Optional[torch.FloatTensor]=None,
+        num_images_per_prompt: int = 1,
+        num_inference_steps: int = 25,
+        generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
+        latents: Optional[torch.FloatTensor] = None,
+        guidance_scale: float = 1.0,
+        output_type: Optional[str] = "pt",  # pt only
+        return_dict: bool = True,
+    ):
+        """
+        Function invoked when calling the pipeline for generation.
+        Args:
+            prompt (`str` or `List[str]`):
+                The prompt or prompts to guide the image generation.
+            negative_prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts not to guide the image generation. Ignored when not using guidance (i.e., ignored
+                if `guidance_scale` is less than `1`).
+            num_images_per_prompt (`int`, *optional*, defaults to 1):
+                The number of images to generate per prompt.
+            num_inference_steps (`int`, *optional*, defaults to 100):
+                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
+                expense of slower inference.
+            generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
+                One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html)
+                to make generation deterministic.
+            latents (`torch.FloatTensor`, *optional*):
+                Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
+                generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
+                tensor will ge generated by sampling using the supplied random `generator`.
+            guidance_scale (`float`, *optional*, defaults to 4.0):
+                Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
+                `guidance_scale` is defined as `w` of equation 2. of [Imagen
+                Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
+                1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`,
+                usually at the expense of lower image quality.
+            output_type (`str`, *optional*, defaults to `"pt"`):
+                The output format of the generate image. Choose between: `"np"` (`np.array`) or `"pt"`
+                (`torch.Tensor`).
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether or not to return a [`~pipelines.ImagePipelineOutput`] instead of a plain tuple.
+        Examples:
+        Returns:
+            [`KandinskyPriorPipelineOutput`] or `tuple`
+        """
+        do_classifier_free_guidance = guidance_scale > 1.0
+        if do_classifier_free_guidance:
+            if negative_input_embeds is None or negative_input_hidden_states is None:
+                raise ValueError('negative_input_embeds and negative_input_hidden_states must be provided')
+        device = self._execution_device
+        batch_size = input_embeds.shape[0]
+        batch_size = batch_size * num_images_per_prompt
+        prompt_embeds, text_encoder_hidden_states, text_mask = self._encode_prompt(
+            "", device, num_images_per_prompt, False, ""
+        )
+        # prior
+        self.scheduler.set_timesteps(num_inference_steps, device=device)
+        prior_timesteps_tensor = self.scheduler.timesteps
+        embedding_dim = self.prior.config.embedding_dim
+        latents = self.prepare_latents(
+            (batch_size, embedding_dim),
+            prompt_embeds.dtype,
+            device,
+            generator,
+            latents,
+            self.scheduler,
+        )
+        for i, t in enumerate(self.progress_bar(prior_timesteps_tensor)):
+            # expand the latents if we are doing classifier free guidance
+            latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents
+            # TODO: I can stop being dependent on the text encoder size
+            image_feat_seq = torch.zeros_like(text_encoder_hidden_states)
+            image_feat_seq[:, :input_hidden_states.shape[1]] = input_hidden_states
+            if input_mask is not None:
+                image_txt_mask = input_mask
+            else:
+                image_txt_mask = torch.zeros_like(text_mask)
+                image_txt_mask[:, :input_hidden_states.shape[1]] = 1
+            proj_embedding = input_embeds
+            if do_classifier_free_guidance:
+                neg_image_feat_seq = torch.zeros_like(text_encoder_hidden_states)
+                neg_image_feat_seq[:, :negative_input_hidden_states.shape[1]] = negative_input_hidden_states
+                if input_mask is not None:
+                    neg_image_txt_mask = input_mask
+                else:
+                    neg_image_txt_mask = torch.zeros_like(text_mask)
+                    neg_image_txt_mask[:, :negative_input_hidden_states.shape[1]] = 1
+                proj_embedding = torch.cat([negative_input_embeds, proj_embedding])
+                image_feat_seq = torch.cat([neg_image_feat_seq, image_feat_seq])
+                image_txt_mask = torch.cat([neg_image_txt_mask, image_txt_mask])
+            predicted_image_embedding = self.prior(
+                latent_model_input,
+                timestep=t,
+                proj_embedding=proj_embedding,
+                encoder_hidden_states=image_feat_seq,
+                attention_mask=image_txt_mask,
+            ).predicted_image_embedding
+            if do_classifier_free_guidance:
+                # print(f'Doing guidance with scale {guidance_scale}')
+                predicted_image_embedding_uncond, predicted_image_embedding_text = predicted_image_embedding.chunk(2)
+                predicted_image_embedding = predicted_image_embedding_uncond + guidance_scale * (
+                    predicted_image_embedding_text - predicted_image_embedding_uncond
+                )
+            if i + 1 == prior_timesteps_tensor.shape[0]:
+                prev_timestep = None
+            else:
+                prev_timestep = prior_timesteps_tensor[i + 1]
+            latents = self.scheduler.step(
+                predicted_image_embedding,
+                timestep=t,
+                sample=latents,
+                generator=generator,
+                prev_timestep=prev_timestep,
+            ).prev_sample
+        latents = self.prior.post_process_latents(latents)
+        image_embeddings = latents
+        # if negative prompt has been defined, we retrieve split the image embedding into two
+        # if negative_prompt is None:
+        zero_embeds = self.get_zero_embed(latents.shape[0], device=latents.device)
+        if hasattr(self, "final_offload_hook") and self.final_offload_hook is not None:
+            self.final_offload_hook.offload()
+        # else:
+        #     image_embeddings, zero_embeds = image_embeddings.chunk(2)
+        #
+        #     if hasattr(self, "final_offload_hook") and self.final_offload_hook is not None:
+        #         self.prior_hook.offload()
+        if output_type not in ["pt", "np"]:
+            raise ValueError(f"Only the output types `pt` and `np` are supported not output_type={output_type}")
+        if output_type == "np":
+            image_embeddings = image_embeddings.cpu().numpy()
+            zero_embeds = zero_embeds.cpu().numpy()
+        if not return_dict:
+            return (image_embeddings, zero_embeds)
+        return KandinskyPriorPipelineOutput(image_embeds=image_embeddings, negative_image_embeds=zero_embeds)

model/pops_utils.py ADDED Viewed

	@@ -0,0 +1,41 @@

+from typing import List, Tuple
+import torch
+from torch import nn
def preprocess(image_a: torch.Tensor, image_b: torch.Tensor, image_encoder: nn.Module, clip_mean: torch.Tensor,
               clip_std: torch.Tensor, should_drop_cond: List[Tuple[bool, bool]] = None, concat_hidden_states=None,
               image_list=None):
    """Turn the operator inputs into the conditioning tensors fed to the prior.

    Every input is encoded (unless it already is an embedding), normalized with
    ``(embedding - clip_mean) / clip_std`` and stacked along a new dimension 1.

    Args:
        image_a: First input, or ``None`` to omit it. A tensor with exactly two
            dimensions is treated as an already-computed embedding and bypasses
            ``image_encoder``; anything else is passed to ``image_encoder``.
        image_b: Second input, same conventions as ``image_a``.
        image_encoder: Callable invoked as
            ``image_encoder(image, output_hidden_states=False)``; its
            ``image_embeds`` attribute is used. Not called for 2-D inputs.
        clip_mean: Value subtracted from every embedding.
        clip_std: Value every embedding is divided by after the subtraction.
        should_drop_cond: Optional per-sample ``(drop_a, drop_b)`` flags. A set
            flag replaces that sample's embedding of ``image_a`` / ``image_b``
            with zeros. A flag whose input was not supplied is ignored.
        concat_hidden_states: Optional tensor appended after the embeddings
            along dimension 1 (NOTE(review): presumably (batch, seq, dim) so it
            lines up with the stacked embeddings - confirm against callers).
        image_list: Optional iterable of further inputs, placed after
            ``image_a`` / ``image_b`` and handled exactly like them.

    Returns:
        ``(image_embeds, out_hidden_states)``. ``image_embeds`` is an all-zero
        tensor shaped like the first conditioning entry with dimension 1
        squeezed; ``out_hidden_states`` is the concatenation of all entries
        along dimension 1.
    """
    with torch.no_grad():
        images = [image for image in (image_a, image_b) if image is not None]
        # Remember where each named input ended up. The drop flags refer to
        # image_a / image_b specifically, so they must not hit another entry
        # when one of the two is missing (image_b sits in slot 0 if image_a is None).
        slot_a = 0 if image_a is not None else None
        slot_b = len(images) - 1 if image_b is not None else None
        if image_list is not None:
            images.extend(image_list)

        embeds_list = []
        for image in images:
            if len(image.shape) == 2:
                # Already an embedding vector per sample: skip the encoder.
                image_embeds = image
            else:
                image_embeds = image_encoder(image, output_hidden_states=False).image_embeds
            # The arithmetic below allocates a new tensor, so zeroing entries
            # later never writes into the caller's inputs.
            image_embeds = (image_embeds - clip_mean) / clip_std
            embeds_list.append(image_embeds.unsqueeze(1))

        if should_drop_cond is not None:
            for b_ind in range(embeds_list[0].shape[0]):
                should_drop_a, should_drop_b = should_drop_cond[b_ind]
                if should_drop_a and slot_a is not None:
                    embeds_list[slot_a][b_ind].zero_()
                if should_drop_b and slot_b is not None:
                    embeds_list[slot_b][b_ind].zero_()

        if concat_hidden_states is not None:
            embeds_list.append(concat_hidden_states)

        out_hidden_states = torch.concat(embeds_list, dim=1)
        # The projected embedding input is deliberately all zeros; only its
        # shape/dtype/device are taken from the first conditioning entry.
        image_embeds = torch.zeros_like(embeds_list[0].squeeze(1))
    return image_embeds, out_hidden_states

pops.py ADDED Viewed

	@@ -0,0 +1,223 @@

+import gradio as gr
+import torch
+from PIL import Image
+from diffusers import PriorTransformer, UNet2DConditionModel, KandinskyV22Pipeline
+from huggingface_hub import hf_hub_download
+from transformers import CLIPVisionModelWithProjection, CLIPImageProcessor, CLIPTokenizer, CLIPTextModelWithProjection
+from model import pops_utils
+from model.pipeline_pops import pOpsPipeline
# Hub repositories of the stock Kandinsky 2.2 prior and decoder.
kandinsky_prior_repo: str = "kandinsky-community/kandinsky-2-2-prior"
kandinsky_decoder_repo: str = "kandinsky-community/kandinsky-2-2-decoder"

# Hub repository with the learned operator weights, followed by the file name
# of each operator's checkpoint inside that repository.
prior_repo: str = "pOpsPaper/operators"
prior_texture_repo: str = "models/texturing/learned_prior.pth"
prior_instruct_repo: str = "models/instruct/learned_prior.pth"
prior_scene_repo: str = "models/scene/learned_prior.pth"

# Device handles.
gpu, cpu = torch.device("cuda"), torch.device("cpu")
class PopsPipelines:
    """Holds the pOps operator priors and the Kandinsky 2.2 decoder used by the demo.

    Three operator pipelines are kept in ``self.priors_dict`` under the keys
    ``'texturing'``, ``'instruct'`` and ``'scene'``. Each one is moved to
    ``self.device`` only while it runs and is moved back to the CPU afterwards;
    the decoder and the text encoder stay on ``self.device``.
    """

    def __init__(self):
        weight_dtype = torch.float16
        self.weight_dtype = weight_dtype
        device = 'cuda:0'
        self.device = device

        # The image encoder is shared with every operator pipeline (see
        # _load_prior_pipeline), so it follows those pipelines between devices.
        self.image_encoder = CLIPVisionModelWithProjection.from_pretrained(
            kandinsky_prior_repo, subfolder='image_encoder', torch_dtype=weight_dtype).eval()
        self.image_encoder.requires_grad_(False)
        self.image_processor = CLIPImageProcessor.from_pretrained(kandinsky_prior_repo,
                                                                  subfolder='image_processor')
        self.tokenizer = CLIPTokenizer.from_pretrained(kandinsky_prior_repo, subfolder='tokenizer')
        self.text_encoder = CLIPTextModelWithProjection.from_pretrained(
            kandinsky_prior_repo, subfolder='text_encoder', torch_dtype=weight_dtype).eval().to(device)

        # Full decoder, used to turn predicted embeddings into images.
        self.unet = UNet2DConditionModel.from_pretrained(kandinsky_decoder_repo,
                                                         subfolder='unet').to(torch.float16).to(device)
        self.decoder = KandinskyV22Pipeline.from_pretrained(kandinsky_decoder_repo, unet=self.unet,
                                                            torch_dtype=torch.float16)
        self.decoder = self.decoder.to(device)

        self.priors_dict = {
            'texturing': {'repo': prior_texture_repo},
            'instruct': {'repo': prior_instruct_repo},
            'scene': {'repo': prior_scene_repo},
        }
        for entry in self.priors_dict.values():
            entry['pipeline'] = self._load_prior_pipeline(entry['repo'])

    def _load_prior_pipeline(self, weights_filename):
        """Build a pOpsPipeline around a Kandinsky prior patched with operator weights.

        Args:
            weights_filename: File name of the checkpoint inside ``prior_repo``.

        Returns:
            The pipeline, sharing ``self.image_encoder``.
        """
        prior = PriorTransformer.from_pretrained(kandinsky_prior_repo, subfolder="prior")
        prior_path = hf_hub_download(repo_id=prior_repo, filename=str(weights_filename))
        # NOTE(security): torch.load unpickles the file; only point prior_repo
        # at repositories you trust.
        # The weights are copied into a CPU module right below, so there is no
        # reason to deserialize them onto the GPU first.
        prior_state_dict = torch.load(prior_path, map_location='cpu')
        # strict=False: keys that do not match the base prior are not an error.
        prior.load_state_dict(prior_state_dict, strict=False)
        prior.eval()
        prior = prior.to(self.weight_dtype)
        return pOpsPipeline.from_pretrained(kandinsky_prior_repo,
                                            prior=prior,
                                            image_encoder=self.image_encoder,
                                            torch_dtype=torch.float16)

    def process_image(self, input_path):
        """Load an image file as a single-image batch ready for the image encoder.

        Args:
            input_path: Path of the image, or ``None``.

        Returns:
            ``None`` when ``input_path`` is ``None``; otherwise the processed
            pixel values with a leading batch dimension, on ``self.device`` in
            ``self.weight_dtype``.
        """
        if input_path is None:
            return None
        # Close the file handle deterministically; convert() yields an
        # independent in-memory image that outlives the `with` block.
        with Image.open(input_path) as source:
            image_pil = source.convert("RGB").resize((512, 512))
        pixel_values = self.image_processor(image_pil)['pixel_values'][0]
        image = torch.Tensor(pixel_values).to(self.device).unsqueeze(0).to(self.weight_dtype)
        return image

    def process_text(self, text):
        """Encode an instruction and return the hidden states of its real tokens.

        Args:
            text: Instruction string.

        Returns:
            The text encoder's last hidden state, truncated along dimension 1
            to the number of non-padding tokens reported by the tokenizer.
        """
        text_inputs = self.tokenizer(
            text,
            padding="max_length",
            max_length=self.tokenizer.model_max_length,
            truncation=True,
            return_tensors="pt",
        )
        mask = text_inputs.attention_mask.bool()
        # Inference only: do not record an autograd graph for the text encoder.
        with torch.no_grad():
            text_encoder_output = self.text_encoder(text_inputs.input_ids.to(self.device))
        text_encoder_hidden_states = text_encoder_output.last_hidden_state
        # Keep only the positions that hold real tokens.
        text_encoder_concat = text_encoder_hidden_states[:, :mask.sum().item()]
        return text_encoder_concat

    def run_binary(self, input_a, input_b, prior_type):
        """Apply a two-input operator (``'texturing'`` or ``'scene'``).

        Args:
            input_a: First input, an image batch or an already-computed embedding.
            input_b: Second input, same conventions.
            prior_type: Key into ``self.priors_dict`` selecting the operator.

        Returns:
            The pipeline output; ``render`` reads its ``image_embeds`` and
            ``negative_image_embeds`` attributes.
        """
        pipeline = self.priors_dict[prior_type]['pipeline']
        guidance_scale = 8.0 if prior_type == 'texturing' else 1.0
        try:
            pipeline.to(self.device)
            input_image_embeds, input_hidden_state = pops_utils.preprocess(
                input_a, input_b,
                self.image_encoder,
                pipeline.prior.clip_mean.detach(),
                pipeline.prior.clip_std.detach())
            negative_input_embeds = torch.zeros_like(input_image_embeds)
            negative_hidden_states = torch.zeros_like(input_hidden_state)
            img_emb = pipeline(input_embeds=input_image_embeds, input_hidden_states=input_hidden_state,
                               negative_input_embeds=negative_input_embeds,
                               negative_input_hidden_states=negative_hidden_states,
                               num_inference_steps=25,
                               num_images_per_prompt=1,
                               guidance_scale=guidance_scale)
            # Optional
            if prior_type == 'scene':
                # Scene is the closest to what the average of the two inputs
                # represents for a background image, so blend that in as well.
                mean_emb = 0.5 * input_hidden_state[:, 0] + 0.5 * input_hidden_state[:, 1]
                # Undo the normalization applied by preprocess.
                mean_emb = (mean_emb * pipeline.prior.clip_std) + pipeline.prior.clip_mean
                alpha = 0.4
                img_emb.image_embeds = (1 - alpha) * img_emb.image_embeds + alpha * mean_emb
        finally:
            # Release the GPU even when inference fails.
            pipeline.to('cpu')
        return img_emb

    def run_instruct(self, input_a, text):
        """Apply the text-instruction operator to a single input.

        Args:
            input_a: Image batch or already-computed embedding to modify.
            text: Instruction string.

        Returns:
            The pipeline output, as for ``run_binary``.
        """
        text_encodings = self.process_text(text)
        instruct_pipeline = self.priors_dict['instruct']['pipeline']
        try:
            instruct_pipeline.to(self.device)
            input_image_embeds, input_hidden_state = pops_utils.preprocess(
                input_a, None,
                self.image_encoder,
                instruct_pipeline.prior.clip_mean.detach(),
                instruct_pipeline.prior.clip_std.detach(),
                concat_hidden_states=text_encodings)
            negative_input_embeds = torch.zeros_like(input_image_embeds)
            negative_hidden_states = torch.zeros_like(input_hidden_state)
            img_emb = instruct_pipeline(input_embeds=input_image_embeds, input_hidden_states=input_hidden_state,
                                        negative_input_embeds=negative_input_embeds,
                                        negative_input_hidden_states=negative_hidden_states,
                                        num_inference_steps=25,
                                        num_images_per_prompt=1,
                                        guidance_scale=1.0)
        finally:
            # Release the GPU even when inference fails.
            instruct_pipeline.to('cpu')
        return img_emb

    def render(self, img_emb):
        """Decode a predicted embedding into an image with the Kandinsky decoder.

        Args:
            img_emb: Operator output providing ``image_embeds`` and
                ``negative_image_embeds``.

        Returns:
            The first image produced by the decoder.
        """
        images = self.decoder(image_embeds=img_emb.image_embeds,
                              negative_image_embeds=img_emb.negative_image_embeds,
                              num_inference_steps=50, height=512,
                              width=512, guidance_scale=4).images
        return images[0]

    def run_instruct_texture(self, image_object_path, text_instruct, image_texture_path):
        """Texture an object (optional), apply a text instruction (optional), render.

        Args:
            image_object_path: Path of the object image; required.
            text_instruct: Instruction text; empty or ``None`` skips the step.
            image_texture_path: Path of the texture image, or ``None`` to skip
                texturing.

        Returns:
            The rendered image.

        Raises:
            gr.Error: If the object image is missing, or if neither a texture
                nor an instruction was given.
        """
        image_object = self.process_image(image_object_path)
        if image_object is None:
            raise gr.Error('Object image is required')
        image_texture = self.process_image(image_texture_path)

        current_emb = None
        instruct_input = image_object
        if image_texture is not None:
            # Run texturing; its embedding becomes the input of the instruct step.
            current_emb = self.run_binary(input_a=image_object, input_b=image_texture, prior_type='texturing')
            instruct_input = current_emb.image_embeds

        # Truthiness (rather than `!= ''`) also treats None as "no instruction".
        if text_instruct:
            current_emb = self.run_instruct(input_a=instruct_input, text=text_instruct)

        if current_emb is None:
            raise gr.Error('At least one of the inputs is required')

        # Render as image
        return self.render(current_emb)

    def run_texture_scene(self, image_object_path, image_texture_path, image_scene_path):
        """Texture an object (optional), place it in a scene (optional), render.

        Each operator runs only when its own image was supplied and receives
        that image as its second input: texturing consumes the texture, then
        the scene operator consumes the scene.

        Args:
            image_object_path: Path of the object image; required.
            image_texture_path: Path of the texture image, or ``None`` to skip
                texturing.
            image_scene_path: Path of the scene image, or ``None`` to skip the
                scene step.

        Returns:
            The rendered image.

        Raises:
            gr.Error: If the object image is missing, or if neither a texture
                nor a scene was given.
        """
        image_object = self.process_image(image_object_path)
        if image_object is None:
            raise gr.Error('Object image is required')
        image_texture = self.process_image(image_texture_path)
        image_scene = self.process_image(image_scene_path)

        current_emb = None
        scene_input = image_object
        if image_texture is not None:
            # Run texturing; its embedding becomes the input of the scene step.
            current_emb = self.run_binary(input_a=image_object, input_b=image_texture, prior_type='texturing')
            scene_input = current_emb.image_embeds

        if image_scene is not None:
            # Run scene
            current_emb = self.run_binary(input_a=scene_input, input_b=image_scene, prior_type='scene')

        if current_emb is None:
            raise gr.Error('At least one of the images is required')

        # Render as image
        return self.render(current_emb)

requirements.txt ADDED Viewed

	@@ -0,0 +1,6 @@

+diffusers
+transformers
+Pillow
+accelerate
+torch
+torchvision

style.css ADDED Viewed

	@@ -0,0 +1,55 @@

/* Headings: centered, no outer margin. */
h1,
h2,
h3 {
  margin: 0;
  text-align: center;
}

/* Vertically centers an absolutely positioned element inside its container. */
.vertical-center {
  position: absolute;
  top: 50%;
  margin: 0;
  -ms-transform: translateY(-50%);
  transform: translateY(-50%);
}

/* Instruction box: transparent background and border, fixed minimum height. */
.instruct {
  min-height: 250px;
  border: transparent;
  background-color: transparent;
}

/* Elements addressed by id; #component-3 is intentionally not styled. */
#component-0,
#component-2 {
  align-items: center;
  justify-content: center;
}

/* Centering helper; `display: flex` is intentionally not set here. */
.justified-element {
  align-items: center;
  justify-content: center;
}

.small-elem {
  max-width: 400px;
}

/* Highlighted "O" letters in the page title. */
.o-pops {
  font-weight: bold;
  color: #82cf8e; /* Light green color */
}

.o-operators {
  font-weight: bold;
  color: #ac85cc; /* Light purple color */
}