Commit 669c11e ("update")
Parent(s): 75c15ae

Files changed:
- app.py (+18 -9)
- models/mllava/utils.py (+40 -8)
app.py
CHANGED

@@ -4,7 +4,7 @@ import os
 import time
 from PIL import Image
 import functools
-from models.mllava import MLlavaProcessor, LlavaForConditionalGeneration, chat_mllava_stream, MLlavaForConditionalGeneration
+from models.mllava import MLlavaProcessor, LlavaForConditionalGeneration, chat_mllava_stream, MLlavaForConditionalGeneration, chat_mllava
 from models.conversation import conv_templates
 from typing import List
 processor = MLlavaProcessor.from_pretrained("TIGER-Lab/Mantis-8B-siglip-llama3")
@@ -12,7 +12,7 @@ model = LlavaForConditionalGeneration.from_pretrained("TIGER-Lab/Mantis-8B-sigli
 conv_template = conv_templates['llama_3']
 
 @spaces.GPU
-def generate(text:str, images:List[Image.Image], history: List[dict], **kwargs):
+def generate_stream(text:str, images:List[Image.Image], history: List[dict], **kwargs):
     global processor, model
     model = model.to("cuda")
     if not images:
@@ -22,6 +22,15 @@ def generate(text:str, images:List[Image.Image], history: List[dict], **kwargs):
 
     return text
 
+@spaces.GPU
+def generate(text:str, images:List[Image.Image], history: List[dict], **kwargs):
+    global processor, model
+    model = model.to("cuda")
+    if not images:
+        images = None
+    generated_text, history = chat_mllava(text, images, model, processor, history=history, **kwargs)
+    return generated_text
+
 def enable_next_image(uploaded_images, image):
     uploaded_images.append(image)
     return uploaded_images, gr.MultimodalTextbox(value=None, interactive=False)
@@ -87,15 +96,14 @@ def bot(history):
 
     chat_history = get_chat_history(history)
     chat_images = get_chat_images(history)
+
     generation_kwargs = {
         "max_new_tokens": 4096,
-        "…
-        "…
-        "do_sample": True,
+        "num_beams": 1,
+        "do_sample": False
     }
-
-    response = …
-
+
+    response = generate_stream(None, chat_images, chat_history, **generation_kwargs)
     for _output in response:
         history[-1][1] = _output
         time.sleep(0.05)
@@ -191,7 +199,8 @@ Mantis is a multimodal conversational AI model that can chat with users about im
 author={Jiang, Dongfu and He, Xuan and Zeng, Huaye and Wei, Con and Ku, Max and Liu, Qian and Chen, Wenhu},
 journal={arXiv preprint arXiv:2405.01483},
 year={2024}
-}
+}
+```""")
     return demo
 
 
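Note on the app.py change: the old streaming handler `generate` is renamed to `generate_stream`, a new non-streaming `generate` is added on top of `chat_mllava`, and `bot` switches from sampling to greedy decoding ("do_sample": False, "num_beams": 1). The sketch below illustrates the streaming contract `bot` relies on: each yielded value is the cumulative response text so far, so the loop overwrites the last history entry instead of appending to it. `fake_generate_stream` is a hypothetical stand-in for the real model call, not part of the commit.

```python
import time
from typing import Iterator, List

def fake_generate_stream(chunks: List[str]) -> Iterator[str]:
    # Hypothetical stand-in for generate_stream: like the real streamer-backed
    # generator, it yields the accumulated text so far, not just the new delta.
    partial = ""
    for chunk in chunks:
        partial += chunk
        yield partial

# history follows the Gradio chatbot convention: [user_message, bot_message] pairs.
history = [["What is in the image?", None]]
for _output in fake_generate_stream(["A ", "cat ", "on a mat."]):
    history[-1][1] = _output  # overwrite: _output already contains the full prefix
    time.sleep(0.05)          # same pacing bot() uses between UI updates

print(history[-1][1])  # "A cat on a mat."
```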
models/mllava/utils.py
CHANGED

@@ -46,10 +46,27 @@ def chat_mllava(
         for message in history:
             assert message["role"] in conv.roles
             conv.append_message(message["role"], message["text"])
+        if text:
+            assert conv.messages[-1][0] == conv.roles[1], "The last message in the history should be the assistant, if the given text is not empty"
+            conv.append_message(conv.roles[0], text)
+            conv.append_message(conv.roles[1], "")
+            history.append({"role": conv.roles[0], "text": text})
+            history.append({"role": conv.roles[1], "text": ""})
+        else:
+            if conv.messages[-1][0] == conv.roles[1]:
+                assert conv.messages[-1][1] == "", "No user message should be provided"
+            else:
+                assert conv.messages[-1][0] == conv.roles[0], "The last message in the history should be the user, if the given text is empty"
+                conv.append_message(conv.roles[0], "")
+                history.append({"role": conv.roles[0], "text": ""})
     else:
         history = []
-        …
-        …
+        history.append({"role": conv.roles[0], "text": text})
+        history.append({"role": conv.roles[1], "text": ""})
+        conv.append_message(conv.roles[0], text)
+        conv.append_message(conv.roles[1], "")
+    assert conv.messages[-1][0] == conv.roles[1] and conv.messages[-1][1] == "", "Format check"
+    assert history[-1]["role"] == conv.roles[1] and history[-1]["text"] == "", "Format check"
 
     prompt = conv.get_prompt()
     if images:
@@ -75,8 +92,7 @@ def chat_mllava(
     generated_ids = output_ids[inputs["input_ids"].shape[-1]:]
     generated_text = processor.decode(generated_ids, skip_special_tokens=True)
 
-    history…
-    history.append({"role": conv.roles[1], "text": generated_text})
+    history[-1]["text"] = generated_text
 
     return generated_text, history
 
@@ -120,10 +136,27 @@ def chat_mllava_stream(
         for message in history:
             assert message["role"] in conv.roles
             conv.append_message(message["role"], message["text"])
+        if text:
+            assert conv.messages[-1][0] == conv.roles[1], "The last message in the history should be the assistant, if the given text is not empty"
+            conv.append_message(conv.roles[0], text)
+            conv.append_message(conv.roles[1], "")
+            history.append({"role": conv.roles[0], "text": text})
+            history.append({"role": conv.roles[1], "text": ""})
+        else:
+            if conv.messages[-1][0] == conv.roles[1]:
+                assert conv.messages[-1][1] == "", "No user message should be provided"
+            else:
+                assert conv.messages[-1][0] == conv.roles[0], "The last message in the history should be the user, if the given text is empty"
+                conv.append_message(conv.roles[0], "")
+                history.append({"role": conv.roles[0], "text": ""})
     else:
         history = []
-        …
-        …
+        history.append({"role": conv.roles[0], "text": text})
+        history.append({"role": conv.roles[1], "text": ""})
+        conv.append_message(conv.roles[0], text)
+        conv.append_message(conv.roles[1], "")
+    assert conv.messages[-1][0] == conv.roles[1] and conv.messages[-1][1] == "", "Format check"
+    assert history[-1]["role"] == conv.roles[1] and history[-1]["text"] == "", "Format check"
 
     prompt = conv.get_prompt()
     if images:
@@ -132,6 +165,7 @@ def chat_mllava_stream(
             images[i] = PIL.Image.open(images[i])
 
     inputs = processor(images=images, text=prompt, return_tensors="pt", truncation=True, max_length=max_input_length)
+    print(processor.tokenizer.decode(inputs["input_ids"][0]))
     for k, v in inputs.items():
         if v is not None:
             if isinstance(v, torch.Tensor):
@@ -148,8 +182,6 @@ def chat_mllava_stream(
     inputs.update(kwargs)
     thread = Thread(target=model.generate, kwargs=inputs)
    thread.start()
-    history.append({"role": conv.roles[0], "text": text})
-    history.append({"role": conv.roles[1], "text": ""})
     for _output in streamer:
         history[-1]["text"] += _output
         yield history[-1]["text"], history
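Note on the utils.py change: both chat functions now normalize the conversation before the prompt is built, so the template and the dict-based history always end with an empty assistant turn. Generation then fills that slot in place (history[-1]["text"] = generated_text) rather than appending messages afterwards, which is what lets the streaming loop accumulate into history[-1]. Below is a minimal, self-contained sketch of that bookkeeping for the two unambiguous cases (fresh conversation, and new user text on an existing history). Conv is a hypothetical stand-in for the project's conversation template, and prepare_turns is an illustrative helper, not a function in the repo.

```python
from typing import List, Optional, Tuple

class Conv:
    # Hypothetical stand-in for the template object from models/conversation.
    roles = ("user", "assistant")
    def __init__(self) -> None:
        self.messages: List[List[str]] = []
    def append_message(self, role: str, text: str) -> None:
        self.messages.append([role, text])

def prepare_turns(text: str, history: Optional[List[dict]]) -> Tuple[Conv, List[dict]]:
    conv = Conv()
    if history is not None:
        # Replay prior turns into the template, then open a new exchange.
        for message in history:
            assert message["role"] in conv.roles
            conv.append_message(message["role"], message["text"])
        conv.append_message(conv.roles[0], text)
        conv.append_message(conv.roles[1], "")
        history.append({"role": conv.roles[0], "text": text})
        history.append({"role": conv.roles[1], "text": ""})
    else:
        # Fresh conversation: seed history and template together.
        history = [{"role": conv.roles[0], "text": text},
                   {"role": conv.roles[1], "text": ""}]
        conv.append_message(conv.roles[0], text)
        conv.append_message(conv.roles[1], "")
    # The same invariants the commit asserts as "Format check".
    assert conv.messages[-1][0] == conv.roles[1] and conv.messages[-1][1] == ""
    assert history[-1]["role"] == conv.roles[1] and history[-1]["text"] == ""
    return conv, history

conv, history = prepare_turns("Describe the image.", None)
history[-1]["text"] = "A cat on a mat."  # what generation later writes in place
print(history)
```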