Spaces:

flax-community
/

Multilingual-VQA

Runtime error

App Files Files Community

gchhablani committed on Jul 18, 2021

Commit

690384a

1 Parent(s): 0e71038

Fix style

Browse files

Files changed (4) hide show

app.py +63 -35
requirements.txt +2 -1
translate_answer_mapping.py +9 -3
utils.py +16 -5

app.py CHANGED Viewed

@@ -5,8 +5,17 @@ import json
 import os
 import numpy as np
 from streamlit.elements import markdown
-from model.flax_clip_vision_bert.modeling_clip_vision_bert import FlaxCLIPVisionBertForSequenceClassification
-from utils import get_transformed_image, get_text_attributes, get_top_5_predictions, plotly_express_horizontal_bar_plot, translate_labels
 import matplotlib.pyplot as plt
 from mtranslate import translate
 from PIL import Image
@@ -16,23 +25,30 @@ from session import _get_state
 state = _get_state()
 @st.cache(persist=True)
 def load_model(ckpt):
     return FlaxCLIPVisionBertForSequenceClassification.from_pretrained(ckpt)
 @st.cache(persist=True)
 def predict(transformed_image, question_inputs):
-    return np.array(model(pixel_values = transformed_image, **question_inputs)[0][0])
 def softmax(logits):
-    return np.exp(logits)/np.sum(np.exp(logits), axis=0)
 def read_markdown(path, parent="./sections/"):
-    with open(os.path.join(parent,path)) as f:
         return f.read()
-checkpoints = ['./ckpt/ckpt-60k-5999'] # TODO: Maybe add more checkpoints?
-dummy_data = pd.read_csv('dummy_vqa_multilingual.tsv', sep='\t')
 code_to_name = {
     "en": "English",
     "fr": "French",
@@ -40,7 +56,7 @@ code_to_name = {
     "es": "Spanish",
 }
-with open('answer_reverse_mapping.json') as f:
     answer_reverse_mapping = json.load(f)
@@ -52,7 +68,9 @@ st.set_page_config(
 )
 st.title("Multilingual Visual Question Answering")
-st.write("[Gunjan Chhablani](https://huggingface.co/gchhablani), [Bhavitvya Malik](https://huggingface.co/bhavitvyamalik)")
 with st.beta_expander("Usage"):
     st.markdown(read_markdown("usage.md"))
@@ -60,67 +78,77 @@ with st.beta_expander("Usage"):
 first_index = 20
 # Init Session State
 if state.image_file is None:
-    state.image_file = dummy_data.loc[first_index,'image_file']
-    state.question = dummy_data.loc[first_index,'question'].strip('- ')
-    state.answer_label = dummy_data.loc[first_index,'answer_label']
-    state.question_lang_id = dummy_data.loc[first_index, 'lang_id']
-    state.answer_lang_id = dummy_data.loc[first_index, 'lang_id']
-    image_path = os.path.join('images',state.image_file)
     image = plt.imread(image_path)
     state.image = image
-col1, col2 = st.beta_columns([6,4])
-if col2.button('Get a random example'):
     sample = dummy_data.sample(1).reset_index()
-    state.image_file = sample.loc[0,'image_file']
-    state.question = sample.loc[0,'question'].strip('- ')
-    state.answer_label = sample.loc[0,'answer_label']
-    state.question_lang_id = sample.loc[0, 'lang_id']
-    state.answer_lang_id = sample.loc[0, 'lang_id']
-    image_path = os.path.join('images',state.image_file)
     image = plt.imread(image_path)
     state.image = image
 col2.write("OR")
-uploaded_file = col2.file_uploader('Upload your image', type=['png','jpg','jpeg'])
 if uploaded_file is not None:
-    state.image_file = os.path.join('images/val2014',uploaded_file.name)
     state.image = np.array(Image.open(uploaded_file))
 transformed_image = get_transformed_image(state.image)
 # Display Image
-col1.image(state.image, use_column_width='always')
 # Display Question
 question = col2.text_input(label="Question", value=state.question)
-col2.markdown(f"""**English Translation**: {question if state.question_lang_id == "en" else translate(question, 'en')}""")
 question_inputs = get_text_attributes(question)
 # Select Language
-options = ['en', 'de', 'es', 'fr']
-state.answer_lang_id = col2.selectbox('Answer Language', index=options.index(state.answer_lang_id), options=options, format_func = lambda x: code_to_name[x])
 # Display Top-5 Predictions
-with st.spinner('Loading model...'):
     model = load_model(checkpoints[0])
-with st.spinner('Predicting...'):
     logits = predict(transformed_image, dict(question_inputs))
 logits = softmax(logits)
 labels, values = get_top_5_predictions(logits, answer_reverse_mapping)
 translated_labels = translate_labels(labels, state.answer_lang_id)
 fig = plotly_express_horizontal_bar_plot(values, translated_labels)
-st.plotly_chart(fig, use_container_width = True)
 st.write(read_markdown("abstract.md"))
 st.write(read_markdown("caveats.md"))
 st.write("# Methodology")
-st.image("./misc/Multilingual-VQA.png", caption="Masked LM model for Image-text Pretraining.")
 st.markdown(read_markdown("pretraining.md"))
 st.markdown(read_markdown("finetuning.md"))
 st.write(read_markdown("challenges.md"))

 import os
 import numpy as np
 from streamlit.elements import markdown
+import cv2
+from model.flax_clip_vision_bert.modeling_clip_vision_bert import (
+    FlaxCLIPVisionBertForSequenceClassification,
+)
+from utils import (
+    get_transformed_image,
+    get_text_attributes,
+    get_top_5_predictions,
+    plotly_express_horizontal_bar_plot,
+    translate_labels,
+)
 import matplotlib.pyplot as plt
 from mtranslate import translate
 from PIL import Image
 state = _get_state()
 @st.cache(persist=True)
 def load_model(ckpt):
     return FlaxCLIPVisionBertForSequenceClassification.from_pretrained(ckpt)
 @st.cache(persist=True)
 def predict(transformed_image, question_inputs):
+    return np.array(model(pixel_values=transformed_image, **question_inputs)[0][0])
 def softmax(logits):
+    return np.exp(logits) / np.sum(np.exp(logits), axis=0)
 def read_markdown(path, parent="./sections/"):
+    with open(os.path.join(parent, path)) as f:
         return f.read()
+def resize_height(image, new_height):
+    h, w, c = image.shape
+checkpoints = ["./ckpt/ckpt-60k-5999"]  # TODO: Maybe add more checkpoints?
+dummy_data = pd.read_csv("dummy_vqa_multilingual.tsv", sep="\t")
 code_to_name = {
     "en": "English",
     "fr": "French",
     "es": "Spanish",
 }
+with open("answer_reverse_mapping.json") as f:
     answer_reverse_mapping = json.load(f)
 )
 st.title("Multilingual Visual Question Answering")
+st.write(
+    "[Gunjan Chhablani](https://huggingface.co/gchhablani), [Bhavitvya Malik](https://huggingface.co/bhavitvyamalik)"
+)
 with st.beta_expander("Usage"):
     st.markdown(read_markdown("usage.md"))
 first_index = 20
 # Init Session State
 if state.image_file is None:
+    state.image_file = dummy_data.loc[first_index, "image_file"]
+    state.question = dummy_data.loc[first_index, "question"].strip("- ")
+    state.answer_label = dummy_data.loc[first_index, "answer_label"]
+    state.question_lang_id = dummy_data.loc[first_index, "lang_id"]
+    state.answer_lang_id = dummy_data.loc[first_index, "lang_id"]
+    image_path = os.path.join("images", state.image_file)
     image = plt.imread(image_path)
     state.image = image
+col1, col2 = st.beta_columns([6, 4])
+if col2.button("Get a random example"):
     sample = dummy_data.sample(1).reset_index()
+    state.image_file = sample.loc[0, "image_file"]
+    state.question = sample.loc[0, "question"].strip("- ")
+    state.answer_label = sample.loc[0, "answer_label"]
+    state.question_lang_id = sample.loc[0, "lang_id"]
+    state.answer_lang_id = sample.loc[0, "lang_id"]
+    image_path = os.path.join("images", state.image_file)
     image = plt.imread(image_path)
     state.image = image
 col2.write("OR")
+uploaded_file = col2.file_uploader("Upload your image", type=["png", "jpg", "jpeg"])
 if uploaded_file is not None:
+    state.image_file = os.path.join("images/val2014", uploaded_file.name)
     state.image = np.array(Image.open(uploaded_file))
+state.image =
 transformed_image = get_transformed_image(state.image)
 # Display Image
+col1.image(state.image, use_column_width="always")
 # Display Question
 question = col2.text_input(label="Question", value=state.question)
+col2.markdown(
+    f"""**English Translation**: {question if state.question_lang_id == "en" else translate(question, 'en')}"""
+)
 question_inputs = get_text_attributes(question)
 # Select Language
+options = ["en", "de", "es", "fr"]
+state.answer_lang_id = col2.selectbox(
+    "Answer Language",
+    index=options.index(state.answer_lang_id),
+    options=options,
+    format_func=lambda x: code_to_name[x],
+)
 # Display Top-5 Predictions
+with st.spinner("Loading model..."):
     model = load_model(checkpoints[0])
+with st.spinner("Predicting..."):
     logits = predict(transformed_image, dict(question_inputs))
 logits = softmax(logits)
 labels, values = get_top_5_predictions(logits, answer_reverse_mapping)
 translated_labels = translate_labels(labels, state.answer_lang_id)
 fig = plotly_express_horizontal_bar_plot(values, translated_labels)
+st.plotly_chart(fig, use_container_width=True)
 st.write(read_markdown("abstract.md"))
 st.write(read_markdown("caveats.md"))
 st.write("# Methodology")
+st.image(
+    "./misc/Multilingual-VQA.png", caption="Masked LM model for Image-text Pretraining."
+)
 st.markdown(read_markdown("pretraining.md"))
 st.markdown(read_markdown("finetuning.md"))
 st.write(read_markdown("challenges.md"))

requirements.txt CHANGED Viewed

@@ -4,4 +4,5 @@ git+https://github.com/huggingface/transformers.git
 torchvision==0.10.0
 mtranslate==1.8
 black==21.7b0
-flax==0.3.4

 torchvision==0.10.0
 mtranslate==1.8
 black==21.7b0
+flax==0.3.4
+opencv-python==4.5.3

translate_answer_mapping.py CHANGED Viewed

@@ -4,6 +4,7 @@ from tqdm import tqdm
 import ray
 from asyncio import Event
 from ray.actor import ActorHandle
 ray.init()
 from typing import Tuple
@@ -48,6 +49,7 @@ class ProgressBarActor:
         """
         return self.counter
 class ProgressBar:
     progress_actor: ActorHandle
     total: int
@@ -89,14 +91,16 @@ class ProgressBar:
 with open("answer_reverse_mapping.json") as f:
     answer_reverse_mapping = json.load(f)
 @ray.remote
 def translate_answer(value, pba):
     temp = {}
     for lang in ["fr", "es", "de"]:
-        temp.update({lang: translate(value, lang, 'en')})
     pba.update.remote(1)
     return temp
 translation_dicts = []
 pb = ProgressBar(len(answer_reverse_mapping.values()))
 actor = pb.actor
@@ -104,8 +108,10 @@ for value in answer_reverse_mapping.values():
     translation_dicts.append(translate_answer.remote(value, actor))
 pb.print_until_done()
-translation_dict = dict(zip(answer_reverse_mapping.values(),ray.get(translation_dicts)))
 with open("translation_dict.json", "w") as f:
-    json.dump(translation_dict, f)

 import ray
 from asyncio import Event
 from ray.actor import ActorHandle
 ray.init()
 from typing import Tuple
         """
         return self.counter
 class ProgressBar:
     progress_actor: ActorHandle
     total: int
 with open("answer_reverse_mapping.json") as f:
     answer_reverse_mapping = json.load(f)
 @ray.remote
 def translate_answer(value, pba):
     temp = {}
     for lang in ["fr", "es", "de"]:
+        temp.update({lang: translate(value, lang, "en")})
     pba.update.remote(1)
     return temp
 translation_dicts = []
 pb = ProgressBar(len(answer_reverse_mapping.values()))
 actor = pb.actor
     translation_dicts.append(translate_answer.remote(value, actor))
 pb.print_until_done()
+translation_dict = dict(
+    zip(answer_reverse_mapping.values(), ray.get(translation_dicts))
+)
 with open("translation_dict.json", "w") as f:
+    json.dump(translation_dict, f)

utils.py CHANGED Viewed

@@ -7,6 +7,8 @@ from transformers import BertTokenizerFast
 import plotly.express as px
 import json
 from PIL import Image
 class Transform(torch.nn.Module):
     def __init__(self, image_size):
         super().__init__()
@@ -31,7 +33,7 @@ transform = Transform(224)
 def get_transformed_image(image):
     if image.shape[-1] == 3 and isinstance(image, np.ndarray):
-        image = image.transpose(2,0,1)
         image = torch.tensor(image)
     return transform(image).unsqueeze(0).permute(0, 2, 3, 1).numpy()
@@ -49,13 +51,15 @@ def get_top_5_predictions(logits, answer_reverse_mapping):
     labels = [answer_reverse_mapping[str(i)] for i in indices]
     return labels, values
-with open('translation_dict.json') as f:
     translate_dict = json.load(f)
 def translate_labels(labels, lang_id):
     translated_labels = []
     for label in labels:
-        if label=="<unk>":
             translated_labels.append("<unk>")
         elif lang_id == "en":
             translated_labels.append(label)
@@ -65,5 +69,12 @@ def translate_labels(labels, lang_id):
 def plotly_express_horizontal_bar_plot(values, labels):
-    fig = px.bar(x=values, y=labels, text = [format(value, ".3%") for value in values], title="Top-5 Predictions", labels={"x": "Scores", "y":"Answers"}, orientation="h")
-    return fig

 import plotly.express as px
 import json
 from PIL import Image
 class Transform(torch.nn.Module):
     def __init__(self, image_size):
         super().__init__()
 def get_transformed_image(image):
     if image.shape[-1] == 3 and isinstance(image, np.ndarray):
+        image = image.transpose(2, 0, 1)
         image = torch.tensor(image)
     return transform(image).unsqueeze(0).permute(0, 2, 3, 1).numpy()
     labels = [answer_reverse_mapping[str(i)] for i in indices]
     return labels, values
+with open("translation_dict.json") as f:
     translate_dict = json.load(f)
 def translate_labels(labels, lang_id):
     translated_labels = []
     for label in labels:
+        if label == "<unk>":
             translated_labels.append("<unk>")
         elif lang_id == "en":
             translated_labels.append(label)
 def plotly_express_horizontal_bar_plot(values, labels):
+    fig = px.bar(
+        x=values,
+        y=labels,
+        text=[format(value, ".3%") for value in values],
+        title="Top-5 Predictions",
+        labels={"x": "Scores", "y": "Answers"},
+        orientation="h",
+    )
+    return fig