Spaces:

Salesforce
/

BLIP2

Running

App Files Files Community

bramw committed on Mar 22, 2023

Commit

3638e5c

1 Parent(s): ddb5bc5

Fixing accidental edit

Browse files

My bad, clicked into wrong space to edit the app

Files changed (1) hide show

app.py +242 -97

app.py CHANGED Viewed

@@ -1,140 +1,285 @@
-import gradio as gr
-import numpy as np
-# from edict_functions import EDICT_editing
-from PIL import Image
-from utils import Endpoint, get_token
 from io import BytesIO
-import requests
-endpoint = Endpoint()
-def local_edict(x, source_text, edit_text,
-         edit_strength, guidance_scale,
-          steps=50, mix_weight=0.93, ):
-    x = Image.fromarray(x)
-    return_im =  EDICT_editing(x,
-                         source_text,
-                         edit_text,
-                  steps=steps,
-                  mix_weight=mix_weight,
-                  init_image_strength=edit_strength,
-                  guidance_scale=guidance_scale
-                              )[0]
-    return np.array(return_im)
 def encode_image(image):
     buffered = BytesIO()
-    image.save(buffered, format="JPEG", quality=95)
     buffered.seek(0)
     return buffered
-def decode_image(img_obj):
-    img = Image.open(img_obj).convert("RGB")
-    return img
-def edict(x, source_text, edit_text,
-         edit_strength, guidance_scale,
-          steps=50, mix_weight=0.93, ):
     url = endpoint.url
-    url = url + "/api/edit"
-    headers = {
-        "User-Agent": "EDICT HuggingFace Space",
         "Auth-Token": get_token(),
     }
     data = {
-        "source_text": source_text,
-        "edit_text": edit_text,
-        "edit_strength": edit_strength,
-        "guidance_scale": guidance_scale,
     }
-    image = encode_image(Image.fromarray(x))
-    files = {"image": image}
     response = requests.post(url, data=data, files=files, headers=headers)
     if response.status_code == 200:
-        return np.array(decode_image(BytesIO(response.content)))
     else:
-        return "Error: " + response.text
-    # x = decode_image(response)
-    # return np.array(x)
-examples = [
-        ['square_ims/american_gothic.jpg', 'A painting of two people frowning', 'A painting of two people smiling', 0.5, 3],
-        ['square_ims/colloseum.jpg', 'An old ruined building', 'A new modern office building', 0.8, 3],
-    ]
-examples.append(['square_ims/scream.jpg', 'A painting of someone screaming', 'A painting of an alien', 0.5, 3])
-examples.append(['square_ims/yosemite.jpg', 'Granite forest valley', 'Granite desert valley', 0.8, 3])
-examples.append(['square_ims/einstein.jpg', 'Mouth open', 'Mouth closed', 0.8, 3])
-examples.append(['square_ims/einstein.jpg', 'A man', 'A man in K.I.S.S. facepaint', 0.8, 3])
-"""
-examples.extend([
-        ['square_ims/imagenet_cake_2.jpg', 'A cupcake', 'A Chinese New Year cupcake', 0.8, 3],
-        ['square_ims/imagenet_cake_2.jpg', 'A cupcake', 'A Union Jack cupcake', 0.8, 3],
-        ['square_ims/imagenet_cake_2.jpg', 'A cupcake', 'A Nigerian flag cupcake', 0.8, 3],
-        ['square_ims/imagenet_cake_2.jpg', 'A cupcake', 'A Santa Claus cupcake', 0.8, 3],
-        ['square_ims/imagenet_cake_2.jpg', 'A cupcake', 'An Easter cupcake', 0.8, 3],
-        ['square_ims/imagenet_cake_2.jpg', 'A cupcake', 'A hedgehog cupcake', 0.8, 3],
-        ['square_ims/imagenet_cake_2.jpg', 'A cupcake', 'A rose cupcake', 0.8, 3],
-    ])
-"""
-for dog_i in [1, 2]:
-    for breed in ['Golden Retriever', 'Chihuahua', 'Dalmatian']:
-        examples.append([f'square_ims/imagenet_dog_{dog_i}.jpg', 'A dog', f'A {breed}', 0.8, 3])
-description = '**For safety and ethical considerations, we have disabled image uploading from March 21. 2023.\nPlease try examples provided below.**\nA gradio demo for [EDICT](https://arxiv.org/abs/2211.12446) (CVPR23)'
-# description = gr.Markdown(description)
-article = """
-### Prompting Style
-As with many text-to-image methods, the prompting style of EDICT can make a big difference. When in doubt, experiment! Some guidance:
-* Parallel *Original Description* and *Edit Description* construction as much as possible. Inserting/editing single words often is enough to affect a change while maintaining a lot of the original structure
-* Words that will affect the entire setting (e.g. "A photo of " vs. "A painting of") can make a big difference. Playing around with them can help a lot
-### Parameters
-Both `edit_strength` and `guidance_scale` have similar properties qualitatively: the higher the value the more the image will change. We suggest
-* Increasing/decreasing `edit_strength` first, particularly to alter/preserve more of the original structure/content
-* Then changing `guidance_scale` to make the change in the edited region more or less pronounced.
-Usually we find changing `edit_strength` to be enough, but feel free to play around (and report any interesting results)!
-### Misc.
-Having difficulty coming up with a caption? Try [BLIP](https://huggingface.co/spaces/Salesforce/BLIP2) to automatically generate one!
-As with most StableDiffusion approaches, faces/text are often problematic to render, especially if they're small. Having these in the foreground will help keep them cleaner.
-A returned black image means that the [Safety Checker](https://huggingface.co/CompVis/stable-diffusion-safety-checker) triggered on the photo. This happens in odd cases sometimes (it often rejects
-the huggingface logo or variations), but we need to keep it in for obvious reasons.
 """
-# article = gr.Markdown(description)
-iface = gr.Interface(fn=edict, inputs=[gr.Image(interactive=False),
-                                       gr.Textbox(label="Original Description"),
-                                       gr.Textbox(label="Edit Description"),
-                                       # 50, # gr.Slider(5, 50, value=20, step=1),
-                                       # 0.93, # gr.Slider(0.5, 1, value=0.7, step=0.05),
-                                       gr.Slider(0.0, 1, value=0.8, step=0.05),
-                                       gr.Slider(0, 10, value=3, step=0.5),
-                                      ],
-                     # examples = examples,
-                     outputs="image",
-                     description=description,
-                     article=article,
-                     #cache_examples=True
                     )
-iface.launch()

 from io import BytesIO
+import string
+import gradio as gr
+import requests
+from utils import Endpoint, get_token
 def encode_image(image):
+    """Serialize `image` as JPEG into an in-memory buffer and return the buffer.
+
+    `image` must provide a PIL-style ``save(fp, format=...)`` method. The
+    buffer is rewound to position 0 before it is returned, so it can be read
+    from the start (it is used as the file part of an upload).
+    """
     buffered = BytesIO()
+    image.save(buffered, format="JPEG")
     buffered.seek(0)
     return buffered
+def query_chat_api(
+    image, prompt, decoding_method, temperature, len_penalty, repetition_penalty
+):
+    """POST an image and a text prompt to the backend's ``/api/generate`` route.
+
+    The image is uploaded as JPEG under the ``image`` file field; the prompt
+    and decoding options travel as ordinary form fields.
+
+    Returns the decoded JSON body on HTTP 200. For any other status, returns
+    the string ``"Error: "`` followed by the response text.
+    """
+    url = endpoint.url + "/api/generate"
+    request_headers = {
+        "User-Agent": "BLIP-2 HuggingFace Space",
+        "Auth-Token": get_token(),
+    }
+    form_fields = {
+        "prompt": prompt,
+        # True only when the decoding choice is exactly "Nucleus sampling".
+        "use_nucleus_sampling": decoding_method == "Nucleus sampling",
+        "temperature": temperature,
+        "length_penalty": len_penalty,
+        "repetition_penalty": repetition_penalty,
+    }
+    response = requests.post(
+        url,
+        data=form_fields,
+        files={"image": encode_image(image)},
+        headers=request_headers,
+    )
+    if response.status_code != 200:
+        return "Error: " + response.text
+    return response.json()
+def query_caption_api(
+    image, decoding_method, temperature, len_penalty, repetition_penalty
+):
+    """POST an image to the backend's ``/api/caption`` route.
+
+    The image is uploaded as JPEG under the ``image`` file field; the decoding
+    options are sent as form fields. No text prompt is sent.
+
+    Returns the decoded JSON body on HTTP 200. For any other status, returns
+    the string ``"Error: "`` followed by the response text, so callers receive
+    a plain string instead of the JSON structure.
+    """
     url = endpoint.url
+    url = url + "/api/caption"
+    headers = {
+        "User-Agent": "BLIP-2 HuggingFace Space",
         "Auth-Token": get_token(),
     }
     data = {
+        # True only when the decoding choice is exactly "Nucleus sampling".
+        "use_nucleus_sampling": decoding_method == "Nucleus sampling",
+        "temperature": temperature,
+        "length_penalty": len_penalty,
+        "repetition_penalty": repetition_penalty,
     }
+    # Re-encode the image as an in-memory JPEG for the multipart upload.
+    image = encode_image(image)
+    files = {"image": image}
     response = requests.post(url, data=data, files=files, headers=headers)
     if response.status_code == 200:
+        return response.json()
     else:
+        return "Error: " + response.text
+def postprocess_output(output):
+    """Ensure the first generated string ends with punctuation.
+
+    `output` is expected to be a list whose first element is the generated
+    text. If that text is non-empty and its last character is not in
+    ``string.punctuation``, a full stop is appended in place. The same
+    `output` object is returned.
+
+    Anything that is not a non-empty list (for example the "Error: ..." string
+    returned by the query helpers on failure) and an empty first string are
+    returned untouched instead of raising IndexError/TypeError.
+    """
+    if not isinstance(output, list) or not output:
+        return output
+    first = output[0]
+    # An empty answer has no last character to inspect; leave it as is.
+    if first and first[-1] not in string.punctuation:
+        output[0] = first + "."
+    return output
+def inference_chat(
+    image,
+    text_input,
+    decoding_method,
+    temperature,
+    length_penalty,
+    repetition_penalty,
+    history=None,
+):
+    """Run one chat turn and return updates for the chatbot and history state.
+
+    `history` is a flat list alternating user prompts and model answers. The
+    new prompt is appended, the whole history is joined with single spaces to
+    form the model prompt, and the generated answer(s) are appended after it.
+
+    Returns a dict keyed by the `chatbot` and `state` components: the chat as
+    a list of (prompt, answer) tuples, and the updated flat history. If the
+    backend call fails, the error text is shown as the answer to this prompt
+    and the stored history is returned unchanged, so a failed turn never
+    becomes part of later prompts.
+    """
+    # Copy instead of mutating: a `history=[]` default would be shared between
+    # calls, and appending in place would leave a dangling prompt in the stored
+    # history whenever the request fails.
+    previous = [] if history is None else list(history)
+    history = previous + [text_input]
+    prompt = " ".join(history)
+    output = query_chat_api(
+        image, prompt, decoding_method, temperature, length_penalty, repetition_penalty
+    )
+    if isinstance(output, str):
+        # query_chat_api reports failures as an "Error: ..." string, not a list.
+        failed_chat = list(zip(previous[::2], previous[1::2]))
+        failed_chat.append((text_input, output))
+        return {chatbot: failed_chat, state: previous}
+    history += postprocess_output(output)
+    # Pair prompt/answer entries for display; an unpaired trailing entry is dropped.
+    chat = list(zip(history[::2], history[1::2]))
+    return {chatbot: chat, state: history}
+def inference_caption(
+    image,
+    decoding_method,
+    temperature,
+    length_penalty,
+    repetition_penalty,
+):
+    """Caption `image` through query_caption_api and return the text to display.
+
+    Returns the first element of the backend result. If the backend call
+    failed, the full "Error: ..." message is returned; an empty result yields
+    an empty string.
+    """
+    output = query_caption_api(
+        image, decoding_method, temperature, length_penalty, repetition_penalty
+    )
+    # On failure the helper returns an "Error: ..." string; indexing it would
+    # surface only its first character, so hand the whole message back.
+    if isinstance(output, str):
+        return output
+    return output[0] if output else ""
+# Static page copy, rendered through gr.Markdown in the UI definition below.
+title = """<h1 align="center">BLIP-2</h1>"""
+description = """Gradio demo for BLIP-2, image-to-text generation from Salesforce Research. To use it, simply upload your image, or click one of the examples to load them.
+<br> <strong>Disclaimer</strong>: This is a research prototype and is not intended for production use. No data including but not restricted to text and images is collected."""
+# The two <h2> notices at the end are explicitly closed so the markup is well formed.
+article = """<strong>Paper</strong>: <a href='https://arxiv.org/abs/2301.12597' target='_blank'>BLIP-2: Bootstrapping Language-Image Pre-training with Frozen Image Encoders and Large Language Models</a>
+<br> <strong>Code</strong>: BLIP2 is now integrated into GitHub repo: <a href='https://github.com/salesforce/LAVIS' target='_blank'>LAVIS: a One-stop Library for Language and Vision</a>
+<br> <strong>🤗 `transformers` integration</strong>: You can now use `transformers` to use our BLIP-2 models! Check out the <a href='https://huggingface.co/docs/transformers/main/en/model_doc/blip-2' target='_blank'> official docs </a>
+<p> <strong>Project Page</strong>: <a href='https://github.com/salesforce/LAVIS/tree/main/projects/blip2' target='_blank'> BLIP2 on LAVIS</a>
+<br> <strong>Description</strong>: Captioning results from <strong>BLIP2_OPT_6.7B</strong>. Chat results from <strong>BLIP2_FlanT5xxl</strong>.
+<h2><strong>Due to ethical concerns, we have disabled image uploading from March 21. 2023. </strong></h2>
+<h2><strong>Please try examples provided below.</strong></h2>
 """
+# Backend handle; the query helpers read `endpoint.url` at call time, so it is
+# fine that this is created after they are defined.
+endpoint = Endpoint()
+# Each example is [image file, chat prompt]; the pairs are loaded into the image
+# and chat input components by gr.Examples in the UI definition.
+examples = [
+    ["house.png", "How could someone get out of the house?"],
+    ["flower.jpg", "Question: What is this flower and where is it's origin? Answer:"],
+    ["pizza.jpg", "What are steps to cook it?"],
+    ["sunset.jpg", "Here is a romantic message going along the photo:"],
+    ["forbidden_city.webp", "In what dynasties was this place built?"],
+]
+# Page layout and event wiring for the demo.
+# NOTE(review): the CSS below targets "#component-21", which looks like an
+# auto-generated element id, presumably tied to the order in which components
+# are created. Confirm before adding or reordering components.
+with gr.Blocks(
+    css="""
+    .message.svelte-w6rprc.svelte-w6rprc.svelte-w6rprc {font-size: 20px; margin-top: 20px}
+    #component-21 > div.wrap.svelte-w6rprc {height: 600px;}
+    """
+) as iface:
+    # Chat history passed into and returned from inference_chat.
+    state = gr.State([])
+    gr.Markdown(title)
+    gr.Markdown(description)
+    gr.Markdown(article)
+    with gr.Row():
+        # Left column: the (non-editable) image and the decoding controls.
+        with gr.Column(scale=1):
+            image_input = gr.Image(type="pil", interactive=False)
+            # with gr.Row():
+            sampling = gr.Radio(
+                choices=["Beam search", "Nucleus sampling"],
+                value="Beam search",
+                label="Text Decoding Method",
+                interactive=True,
+            )
+            temperature = gr.Slider(
+                minimum=0.5,
+                maximum=1.0,
+                value=1.0,
+                step=0.1,
+                interactive=True,
+                label="Temperature (used with nucleus sampling)",
+            )
+            len_penalty = gr.Slider(
+                minimum=-1.0,
+                maximum=2.0,
+                value=1.0,
+                step=0.2,
+                interactive=True,
+                label="Length Penalty (set to larger for longer sequence, used with beam search)",
+            )
+            rep_penalty = gr.Slider(
+                minimum=1.0,
+                maximum=5.0,
+                value=1.5,
+                step=0.5,
+                interactive=True,
+                label="Repeat Penalty (larger value prevents repetition)",
+            )
+        # Right column: captioning on top, chat underneath.
+        with gr.Column(scale=1.8):
+            with gr.Column():
+                caption_output = gr.Textbox(lines=1, label="Caption Output")
+                caption_button = gr.Button(
+                    value="Caption it!", interactive=True, variant="primary"
+                )
+                caption_button.click(
+                    inference_caption,
+                    [
+                        image_input,
+                        sampling,
+                        temperature,
+                        len_penalty,
+                        rep_penalty,
+                    ],
+                    [caption_output],
+                )
+            gr.Markdown("""Trying prompting your input for chat; e.g. example prompt for QA, \"Question: {} Answer:\" Use proper punctuation (e.g., question mark).""")
+            with gr.Row():
+                with gr.Column(
+                    scale=1.5,
+                ):
+                    chatbot = gr.Chatbot(
+                        label="Chat Output (from FlanT5)",
+                    )
+                # with gr.Row():
+                with gr.Column(scale=1):
+                    chat_input = gr.Textbox(lines=1, label="Chat Input")
+                    # Pressing Enter in the textbox runs a chat turn.
+                    chat_input.submit(
+                        inference_chat,
+                        [
+                            image_input,
+                            chat_input,
+                            sampling,
+                            temperature,
+                            len_penalty,
+                            rep_penalty,
+                            state,
+                        ],
+                        [chatbot, state],
                     )
+                    with gr.Row():
+                        clear_button = gr.Button(value="Clear", interactive=True)
+                        # Reset the input box, the visible chat and the history.
+                        clear_button.click(
+                            lambda: ("", [], []),
+                            [],
+                            [chat_input, chatbot, state],
+                            queue=False,
+                        )
+                        submit_button = gr.Button(
+                            value="Submit", interactive=True, variant="primary"
+                        )
+                        # Same handler and inputs as chat_input.submit above.
+                        submit_button.click(
+                            inference_chat,
+                            [
+                                image_input,
+                                chat_input,
+                                sampling,
+                                temperature,
+                                len_penalty,
+                                rep_penalty,
+                                state,
+                            ],
+                            [chatbot, state],
+                        )
+            # A new image invalidates the caption and the conversation. The
+            # chatbot is reset with an empty list, matching the Clear button,
+            # rather than with an empty string.
+            image_input.change(
+                lambda: ([], "", []),
+                [],
+                [chatbot, caption_output, state],
+                queue=False,
+            )
+    # Clickable examples fill the image and the chat input.
+    examples = gr.Examples(
+        examples=examples,
+        inputs=[image_input, chat_input],
+    )
+# Queue requests (one worker, at most 10 waiting, API not exposed) and start
+# the app. `.queue()` already enables queuing, so the deprecated
+# `enable_queue` argument to `launch()` is not passed.
+iface.queue(concurrency_count=1, api_open=False, max_size=10)
+iface.launch()