Update app.py
app.py
CHANGED
@@ -1,83 +1,118 @@
[previous version of app.py (83 lines) not shown; the updated file follows]
import os
import gradio as gr
from transformers import pipeline
import spaces  # This module is available when deploying on HF Spaces with ZeroGPU

# --- Trending models for image text-to-text tasks ---
TRENDING_MODELS = [
    "Salesforce/blip2-opt-2.7b",
    "Salesforce/blip2-flan-t5-xl",
    "Salesforce/blip-image-captioning-base",
    "Salesforce/blip-image-captioning-large",
    "nlpconnect/vit-gpt2-image-captioning",
    "OFA-Sys/OFA-base",
    "OFA-Sys/OFA-large",
    "dandelin/vilt-b32-finetuned-vqa",
    "dandelin/vilt-b32-mlm",
    "uclanlp/visualbert-vqa-coco-pre"
]

# --- Helper: if the user selects "Custom", they can enter any model identifier ---
def resolve_model(chosen, custom):
    if chosen == "Custom":
        return custom.strip()
    else:
        return chosen
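
# Example of the helper's behavior: resolve_model("Custom", "  username/model_name ") returns the
# trimmed custom identifier "username/model_name", while resolve_model("Salesforce/blip2-opt-2.7b", "")
# returns the dropdown choice unchanged.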

# --- Main inference function ---
# If you are using ZeroGPU on Hugging Face Spaces, make sure to set the environment variable USE_GPU=1.
# The @spaces.GPU() decorator ensures that heavy inference runs on GPU in a ZeroGPU Space.
@spaces.GPU()
def compare_image_to_text_models(image, prompt, model1_choice, model1_custom, model2_choice, model2_custom):
    # Determine which model identifiers to use.
    model1_name = resolve_model(model1_choice, model1_custom)
    model2_name = resolve_model(model2_choice, model2_custom)

    # Set device to GPU (0) if USE_GPU is enabled; otherwise use CPU (-1).
    device = 0 if os.environ.get("USE_GPU", "0") == "1" else -1

    # Create pipelines for image-to-text.
    # Note: Many instruction-following image models (e.g. BLIP2) accept a text prompt along with an image.
    # We use the "image-to-text" task here so that the prompt is taken into account.
    pipe1 = pipeline("image-to-text", model=model1_name, device=device)
    pipe2 = pipeline("image-to-text", model=model2_name, device=device)

    # Run inference on the image with the provided prompt.
    # Prompt support varies by model; the image-to-text pipeline accepts the prompt as the `prompt` keyword argument.
    output1 = pipe1(image, prompt=prompt)
    output2 = pipe2(image, prompt=prompt)

    # Extract the generated text.
    # (Many pipelines return a list of dicts with key 'generated_text'; if not, we simply convert the output to a string.)
    def extract_text(output):
        if isinstance(output, list) and len(output) > 0 and isinstance(output[0], dict) and "generated_text" in output[0]:
            return output[0]["generated_text"]
        else:
            return str(output)

    result1 = extract_text(output1)
    result2 = extract_text(output2)

    # Format results as chat conversations.
    # Each chatbot conversation is a list of (user_message, bot_message) pairs, as expected by gr.Chatbot.
    chat1 = [(prompt, result1)]
    chat2 = [(prompt, result2)]
    return chat1, chat2
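
# For reference, a typical image-to-text pipeline result handled by extract_text() above looks like
# [{"generated_text": "a short caption"}], which extract_text() reduces to the plain caption string.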

# --- Build the Gradio interface ---
# Pre-populated sample prompt.
sample_prompt = "Describe the image in explicit detail. Return a nested JSON object in response."

with gr.Blocks(title="Image Text-to-Text Comparison Tool") as demo:
    gr.Markdown(
        """
        # Image Text-to-Text Comparison Tool
        Compare two trending image text-to-text (instruction-following) models side-by-side.
        Select a model from the dropdown (or choose Custom to enter your own model identifier) and see how it describes the image.
        """
    )

    with gr.Row():
        with gr.Column(scale=1):
            gr.Markdown("## Input")
            image_input = gr.Image(label="Upload an Image", type="pil")
            prompt_input = gr.Textbox(label="Text Prompt", value=sample_prompt, lines=3)
        with gr.Column(scale=1):
            gr.Markdown("## Model Selection")
            with gr.Row():
                with gr.Column():
                    gr.Markdown("### Model 1")
                    model1_choice = gr.Dropdown(
                        choices=TRENDING_MODELS + ["Custom"],
                        value=TRENDING_MODELS[0],
                        label="Select Model 1"
                    )
                    model1_custom = gr.Textbox(label="Custom Model 1", placeholder="e.g., username/model_name")
                with gr.Column():
                    gr.Markdown("### Model 2")
                    model2_choice = gr.Dropdown(
                        choices=TRENDING_MODELS + ["Custom"],
                        value=TRENDING_MODELS[1],
                        label="Select Model 2"
                    )
                    model2_custom = gr.Textbox(label="Custom Model 2", placeholder="e.g., username/model_name")

    compare_button = gr.Button("Compare Models")

    gr.Markdown("## Chatbot Outputs (Side-by-Side)")
    with gr.Row():
        chatbot1 = gr.Chatbot(label="Model 1 Chatbot")
        chatbot2 = gr.Chatbot(label="Model 2 Chatbot")

    compare_button.click(
        fn=compare_image_to_text_models,
        inputs=[image_input, prompt_input, model1_choice, model1_custom, model2_choice, model2_custom],
        outputs=[chatbot1, chatbot2]
    )

demo.launch()
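
For a quick local check of the same pipeline call pattern outside the Space, a minimal sketch along these lines should work; it assumes transformers, torch, and Pillow are installed, and "example.jpg" is a placeholder image path rather than a file shipped with this Space:

# Minimal local sketch (assumes transformers, torch, and Pillow; example.jpg is a placeholder path).
from PIL import Image
from transformers import pipeline

image = Image.open("example.jpg")
# BLIP-style captioning models accept an optional text prefix via the pipeline's `prompt` keyword.
captioner = pipeline("image-to-text", model="Salesforce/blip-image-captioning-base", device=-1)
print(captioner(image, prompt="a photography of"))  # e.g. [{'generated_text': 'a photography of ...'}]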