Spaces:

Salesforce
/

BLIP2

Running

App Files Files Community

bramw committed on Mar 22, 2023

Commit

45d0452

1 Parent(s): 64aea86

Update app.py

Browse files

Files changed (1) hide show

app.py +97 -243

app.py CHANGED Viewed

@@ -1,285 +1,139 @@
-from io import BytesIO
-import string
 import gradio as gr
-import requests
 from utils import Endpoint, get_token
 def encode_image(image):
     buffered = BytesIO()
-    image.save(buffered, format="JPEG")
     buffered.seek(0)
     return buffered
-def query_chat_api(
-    image, prompt, decoding_method, temperature, len_penalty, repetition_penalty
-):
-    url = endpoint.url
-    url = url + "/api/generate"
-    headers = {
-        "User-Agent": "BLIP-2 HuggingFace Space",
-        "Auth-Token": get_token(),
-    }
-    data = {
-        "prompt": prompt,
-        "use_nucleus_sampling": decoding_method == "Nucleus sampling",
-        "temperature": temperature,
-        "length_penalty": len_penalty,
-        "repetition_penalty": repetition_penalty,
-    }
-    image = encode_image(image)
-    files = {"image": image}
-    response = requests.post(url, data=data, files=files, headers=headers)
-    if response.status_code == 200:
-        return response.json()
-    else:
-        return "Error: " + response.text
-def query_caption_api(
-    image, decoding_method, temperature, len_penalty, repetition_penalty
-):
     url = endpoint.url
-    url = url + "/api/caption"
-    headers = {
-        "User-Agent": "BLIP-2 HuggingFace Space",
         "Auth-Token": get_token(),
     }
     data = {
-        "use_nucleus_sampling": decoding_method == "Nucleus sampling",
-        "temperature": temperature,
-        "length_penalty": len_penalty,
-        "repetition_penalty": repetition_penalty,
     }
-    image = encode_image(image)
-    files = {"image": image}
     response = requests.post(url, data=data, files=files, headers=headers)
     if response.status_code == 200:
-        return response.json()
     else:
-        return "Error: " + response.text
-def postprocess_output(output):
-    # if last character is not a punctuation, add a full stop
-    if not output[0][-1] in string.punctuation:
-        output[0] += "."
-    return output
-def inference_chat(
-    image,
-    text_input,
-    decoding_method,
-    temperature,
-    length_penalty,
-    repetition_penalty,
-    history=[],
-):
-    text_input = text_input
-    history.append(text_input)
-    prompt = " ".join(history)
-    output = query_chat_api(
-        image, prompt, decoding_method, temperature, length_penalty, repetition_penalty
-    )
-    output = postprocess_output(output)
-    history += output
-    chat = [
-        (history[i], history[i + 1]) for i in range(0, len(history) - 1, 2)
-    ]  # convert to tuples of list
-    return {chatbot: chat, state: history}
-def inference_caption(
-    image,
-    decoding_method,
-    temperature,
-    length_penalty,
-    repetition_penalty,
-):
-    output = query_caption_api(
-        image, decoding_method, temperature, length_penalty, repetition_penalty
-    )
-    return output[0]
-title = """<h1 align="center">BLIP-2</h1>"""
-description = """Gradio demo for BLIP-2, image-to-text generation from Salesforce Research. To use it, simply upload your image, or click one of the examples to load them.
-<br> <strong>Disclaimer</strong>: This is a research prototype and is not intended for production use. No data including but not restricted to text and images is collected."""
-article = """<strong>Paper</strong>: <a href='https://arxiv.org/abs/2301.12597' target='_blank'>BLIP-2: Bootstrapping Language-Image Pre-training with Frozen Image Encoders and Large Language Models</a>
-<br> <strong>Code</strong>: BLIP2 is now integrated into GitHub repo: <a href='https://github.com/salesforce/LAVIS' target='_blank'>LAVIS: a One-stop Library for Language and Vision</a>
-<br> <strong>🤗 `transformers` integration</strong>: You can now use `transformers` to use our BLIP-2 models! Check out the <a href='https://huggingface.co/docs/transformers/main/en/model_doc/blip-2' target='_blank'> official docs </a>
-<p> <strong>Project Page</strong>: <a href='https://github.com/salesforce/LAVIS/tree/main/projects/blip2' target='_blank'> BLIP2 on LAVIS</a>
-<br> <strong>Description</strong>: Captioning results from <strong>BLIP2_OPT_6.7B</strong>. Chat results from <strong>BLIP2_FlanT5xxl</strong>.
-<p><strong>For safety and ethical considerations, we have disabled image uploading from March 21. 2023. </strong>
-<p><strong>Please try examples provided below.</strong>
-"""
-endpoint = Endpoint()
 examples = [
-    ["house.png", "How could someone get out of the house?"],
-    ["flower.jpg", "Question: What is this flower and where is it's origin? Answer:"],
-    ["pizza.jpg", "What are steps to cook it?"],
-    ["sunset.jpg", "Here is a romantic message going along the photo:"],
-    ["forbidden_city.webp", "In what dynasties was this place built?"],
-]
-with gr.Blocks(
-    css="""
-    .message.svelte-w6rprc.svelte-w6rprc.svelte-w6rprc {font-size: 20px; margin-top: 20px}
-    #component-21 > div.wrap.svelte-w6rprc {height: 600px;}
-    """
-) as iface:
-    state = gr.State([])
-    gr.Markdown(title)
-    gr.Markdown(description)
-    gr.Markdown(article)
-    with gr.Row():
-        with gr.Column(scale=1):
-            image_input = gr.Image(type="pil", interactive=False)
-            # with gr.Row():
-            sampling = gr.Radio(
-                choices=["Beam search", "Nucleus sampling"],
-                value="Beam search",
-                label="Text Decoding Method",
-                interactive=True,
-            )
-            temperature = gr.Slider(
-                minimum=0.5,
-                maximum=1.0,
-                value=1.0,
-                step=0.1,
-                interactive=True,
-                label="Temperature (used with nucleus sampling)",
-            )
-            len_penalty = gr.Slider(
-                minimum=-1.0,
-                maximum=2.0,
-                value=1.0,
-                step=0.2,
-                interactive=True,
-                label="Length Penalty (set to larger for longer sequence, used with beam search)",
-            )
-            rep_penalty = gr.Slider(
-                minimum=1.0,
-                maximum=5.0,
-                value=1.5,
-                step=0.5,
-                interactive=True,
-                label="Repeat Penalty (larger value prevents repetition)",
-            )
-        with gr.Column(scale=1.8):
-            with gr.Column():
-                caption_output = gr.Textbox(lines=1, label="Caption Output")
-                caption_button = gr.Button(
-                    value="Caption it!", interactive=True, variant="primary"
-                )
-                caption_button.click(
-                    inference_caption,
-                    [
-                        image_input,
-                        sampling,
-                        temperature,
-                        len_penalty,
-                        rep_penalty,
-                    ],
-                    [caption_output],
-                )
-            gr.Markdown("""Trying prompting your input for chat; e.g. example prompt for QA, \"Question: {} Answer:\" Use proper punctuation (e.g., question mark).""")
-            with gr.Row():
-                with gr.Column(
-                    scale=1.5,
-                ):
-                    chatbot = gr.Chatbot(
-                        label="Chat Output (from FlanT5)",
-                    )
-                # with gr.Row():
-                with gr.Column(scale=1):
-                    chat_input = gr.Textbox(lines=1, label="Chat Input")
-                    chat_input.submit(
-                        inference_chat,
-                        [
-                            image_input,
-                            chat_input,
-                            sampling,
-                            temperature,
-                            len_penalty,
-                            rep_penalty,
-                            state,
-                        ],
-                        [chatbot, state],
-                    )
-                    with gr.Row():
-                        clear_button = gr.Button(value="Clear", interactive=True)
-                        clear_button.click(
-                            lambda: ("", [], []),
-                            [],
-                            [chat_input, chatbot, state],
-                            queue=False,
-                        )
-                        submit_button = gr.Button(
-                            value="Submit", interactive=True, variant="primary"
-                        )
-                        submit_button.click(
-                            inference_chat,
-                            [
-                                image_input,
-                                chat_input,
-                                sampling,
-                                temperature,
-                                len_penalty,
-                                rep_penalty,
-                                state,
-                            ],
-                            [chatbot, state],
-                        )
-            image_input.change(
-                lambda: ("", "", []),
-                [],
-                [chatbot, caption_output, state],
-                queue=False,
-            )
-    examples = gr.Examples(
-        examples=examples,
-        inputs=[image_input, chat_input],
-    )
-iface.queue(concurrency_count=1, api_open=False, max_size=10)
-iface.launch(enable_queue=True)

 import gradio as gr
+import numpy as np
+# from edict_functions import EDICT_editing
+from PIL import Image
 from utils import Endpoint, get_token
+from io import BytesIO
+import requests
+endpoint = Endpoint()
+def local_edict(x, source_text, edit_text,
+         edit_strength, guidance_scale,
+          steps=50, mix_weight=0.93, ):
+    """Run an EDICT edit in-process and return the edited image as an array.
+
+    ``x`` is converted to a PIL image with ``Image.fromarray`` and handed to
+    ``EDICT_editing`` together with the two prompts; ``edit_strength`` is
+    forwarded as ``init_image_strength``. The first element of the result is
+    converted back with ``np.array`` and returned.
+
+    NOTE(review): the ``from edict_functions import EDICT_editing`` line at
+    the top of this file is commented out, so ``EDICT_editing`` is not
+    defined here and calling this function would raise ``NameError`` as the
+    file stands. The interface below is wired to ``edict`` instead.
+    """
+    x = Image.fromarray(x)
+    return_im =  EDICT_editing(x,
+                         source_text,
+                         edit_text,
+                  steps=steps,
+                  mix_weight=mix_weight,
+                  init_image_strength=edit_strength,
+                  guidance_scale=guidance_scale
+                              )[0]
+    return np.array(return_im)
 def encode_image(image):
+    """Save ``image`` as a JPEG (quality 95) into a new in-memory buffer.
+
+    ``image`` must provide a PIL-style ``save(fp, format=..., quality=...)``
+    method; ``edict`` passes the result of ``Image.fromarray``. The returned
+    ``BytesIO`` is rewound to position 0 so it can be read from the start.
+    """
     buffered = BytesIO()
+    image.save(buffered, format="JPEG", quality=95)
     buffered.seek(0)
     return buffered
+def decode_image(img_obj):
+    """Open ``img_obj`` with ``Image.open`` and return it converted to RGB.
+
+    ``edict`` calls this with a ``BytesIO`` wrapping the HTTP response body.
+    """
+    img = Image.open(img_obj).convert("RGB")
+    return img
+def edict(x, source_text, edit_text,
+         edit_strength, guidance_scale,
+          steps=50, mix_weight=0.93, ):
+    """Edit an image by POSTing it to the remote ``/api/edit`` endpoint.
+
+    Args:
+        x: Image array accepted by ``Image.fromarray`` (presumably the numpy
+            array produced by the ``gr.Image`` input -- confirm).
+        source_text: Description of the original image.
+        edit_text: Description of the desired edited image.
+        edit_strength: Sent to the endpoint as ``edit_strength``.
+        guidance_scale: Sent to the endpoint as ``guidance_scale``.
+        steps: Accepted for signature parity with ``local_edict``; not
+            included in the request.
+        mix_weight: Accepted for signature parity with ``local_edict``; not
+            included in the request.
+
+    Returns:
+        The response body decoded as an RGB image, as a numpy array.
+
+    Raises:
+        gr.Error: If the endpoint answers with a status other than 200. The
+            interface output is an image, so returning the error text as a
+            string would not be displayed as a message; raising surfaces the
+            server's response text in the UI instead.
+    """
     url = endpoint.url
+    url = url + "/api/edit"
+    headers = {
+        "User-Agent": "EDICT HuggingFace Space",
         "Auth-Token": get_token(),
     }
     data = {
+        "source_text": source_text,
+        "edit_text": edit_text,
+        "edit_strength": edit_strength,
+        "guidance_scale": guidance_scale,
     }
+    # The image travels as a multipart file part, JPEG-encoded in memory.
+    image = encode_image(Image.fromarray(x))
+    files = {"image": image}
     response = requests.post(url, data=data, files=files, headers=headers)
     if response.status_code == 200:
+        return np.array(decode_image(BytesIO(response.content)))
     else:
+        raise gr.Error("Error: " + response.text)
 examples = [
+        # Each row supplies the interface inputs in order:
+        # [image path, original description, edit description,
+        #  edit_strength, guidance_scale]
+        ['square_ims/american_gothic.jpg', 'A painting of two people frowning', 'A painting of two people smiling', 0.5, 3],
+        ['square_ims/colloseum.jpg', 'An old ruined building', 'A new modern office building', 0.8, 3],
+    ]
+examples.append(['square_ims/scream.jpg', 'A painting of someone screaming', 'A painting of an alien', 0.5, 3])
+examples.append(['square_ims/yosemite.jpg', 'Granite forest valley', 'Granite desert valley', 0.8, 3])
+examples.append(['square_ims/einstein.jpg', 'Mouth open', 'Mouth closed', 0.8, 3])
+examples.append(['square_ims/einstein.jpg', 'A man', 'A man in K.I.S.S. facepaint', 0.8, 3])
+# The bare string literal below wraps a disabled block of cupcake examples;
+# it is an expression statement with no effect, so these rows are NOT added.
+"""
+examples.extend([
+        ['square_ims/imagenet_cake_2.jpg', 'A cupcake', 'A Chinese New Year cupcake', 0.8, 3],
+        ['square_ims/imagenet_cake_2.jpg', 'A cupcake', 'A Union Jack cupcake', 0.8, 3],
+        ['square_ims/imagenet_cake_2.jpg', 'A cupcake', 'A Nigerian flag cupcake', 0.8, 3],
+        ['square_ims/imagenet_cake_2.jpg', 'A cupcake', 'A Santa Claus cupcake', 0.8, 3],
+        ['square_ims/imagenet_cake_2.jpg', 'A cupcake', 'An Easter cupcake', 0.8, 3],
+        ['square_ims/imagenet_cake_2.jpg', 'A cupcake', 'A hedgehog cupcake', 0.8, 3],
+        ['square_ims/imagenet_cake_2.jpg', 'A cupcake', 'A rose cupcake', 0.8, 3],
+    ])
+"""
+# Two dog photos x three breeds: six more rows, all with strength 0.8 / scale 3.
+for dog_i in [1, 2]:
+    for breed in ['Golden Retriever', 'Chihuahua', 'Dalmatian']:
+        examples.append([f'square_ims/imagenet_dog_{dog_i}.jpg', 'A dog', f'A {breed}', 0.8, 3])
+# Markdown text passed to gr.Interface as `description=`.
+description = '**For safety and ethical considerations, we have disabled image uploading from March 21. 2023.\nPlease try examples provided below.**\nA gradio demo for [EDICT](https://arxiv.org/abs/2211.12446) (CVPR23)'
+# description = gr.Markdown(description)
+# Markdown text passed to gr.Interface as `article=`: prompting tips,
+# parameter guidance and miscellaneous notes for users of the demo.
+article = """
+### Prompting Style
+As with many text-to-image methods, the prompting style of EDICT can make a big difference. When in doubt, experiment! Some guidance:
+* Parallel *Original Description* and *Edit Description* construction as much as possible. Inserting/editing single words often is enough to affect a change while maintaining a lot of the original structure
+* Words that will affect the entire setting (e.g. "A photo of " vs. "A painting of") can make a big difference. Playing around with them can help a lot
+### Parameters
+Both `edit_strength` and `guidance_scale` have similar properties qualitatively: the higher the value the more the image will change. We suggest
+* Increasing/decreasing `edit_strength` first, particularly to alter/preserve more of the original structure/content
+* Then changing `guidance_scale` to make the change in the edited region more or less pronounced.
+Usually we find changing `edit_strength` to be enough, but feel free to play around (and report any interesting results)!
+### Misc.
+Having difficulty coming up with a caption? Try [BLIP](https://huggingface.co/spaces/Salesforce/BLIP2) to automatically generate one!
+As with most StableDiffusion approaches, faces/text are often problematic to render, especially if they're small. Having these in the foreground will help keep them cleaner.
+A returned black image means that the [Safety Checker](https://huggingface.co/CompVis/stable-diffusion-safety-checker) triggered on the photo. This happens in odd cases sometimes (it often rejects
+the huggingface logo or variations), but we need to keep it in for obvious reasons.
+"""
+# article = gr.Markdown(description)
+# Build the demo UI around `edict`. The five inputs map positionally onto
+# edict(x, source_text, edit_text, edit_strength, guidance_scale); `steps`
+# and `mix_weight` have no input component and therefore keep their defaults.
+# The image input is created with interactive=False, in line with the
+# "image uploading disabled" notice in `description`.
+iface = gr.Interface(fn=edict, inputs=[gr.Image(interactive=False),
+                                       gr.Textbox(label="Original Description"),
+                                       gr.Textbox(label="Edit Description"),
+                                       # 50, # gr.Slider(5, 50, value=20, step=1),
+                                       # 0.93, # gr.Slider(0.5, 1, value=0.7, step=0.05),
+                                       gr.Slider(0.0, 1, value=0.8, step=0.05),
+                                       gr.Slider(0, 10, value=3, step=0.5),
+                                      ],
+                     examples = examples,
+                     outputs="image",
+                     description=description,
+                     article=article,
+                     # NOTE(review): presumably makes Gradio precompute the
+                     # example outputs by calling `edict` -- confirm against
+                     # the Gradio version pinned for this Space.
+                     cache_examples=True)
+iface.launch()