Molmo-7B-D-0924

Running on Zero

App Files Files Community

zamal committed on Oct 11, 2024

Commit

fbbadab

verified ·

1 Parent(s): f07f172

Update app.py

Browse files

Files changed (1) hide show

app.py +54 -66

app.py CHANGED Viewed

@@ -1,72 +1,60 @@
 import gradio as gr
-from transformers import AutoModelForCausalLM, AutoProcessor
 from PIL import Image
-import torch
-import os
-import subprocess
-subprocess.run('pip install flash-attn --no-build-isolation', env={'FLASH_ATTENTION_SKIP_CUDA_BUILD': "TRUE"}, shell=True)
-os.system('pip install -U bitsandbytes')
-# Define the repository for the quantized model
-repo_name = "cyan2k/molmo-7B-D-bnb-4bit"
-# Load processor and model with GPU optimization
-device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-processor = AutoProcessor.from_pretrained(repo_name, trust_remote_code=True)
-# Load model with 4-bit quantization
-model = AutoModelForCausalLM.from_pretrained(repo_name,
-                                             device_map="auto",
-                                             torch_dtype=torch.float16,
-                                             load_in_4bit=True,
-                                             trust_remote_code=True)
-model.to(device)
-def process_image_and_text(image, text):
-    # Convert numpy image to PIL format
-    pil_image = Image.fromarray(image)
-    # Process image and text with processor
-    inputs = processor(images=[pil_image], text=text, return_tensors="pt").to(device)
-    # Generate output using the model
-    output = model.generate(**inputs, max_new_tokens=200)
-    # Decode the generated output
-    generated_text = processor.decode(output[0], skip_special_tokens=True)
     return generated_text
-def chatbot(image, text, history):
-    # Check if the image is uploaded
-    if image is None:
-        return history + [("Please upload an image first.", None)]
-    # Get response by processing the image and text
-    response = process_image_and_text(image, text)
-    # Append question and response to the chat history
-    history.append((text, response))
-    return history
-# Define the Gradio interface
-with gr.Blocks() as demo:
-    gr.Markdown("# Image Chatbot with Molmo-7B-4 Bit Quantized")
-    with gr.Row():
-        image_input = gr.Image(type="numpy")
-        chatbot_output = gr.Chatbot()
-    text_input = gr.Textbox(placeholder="Ask a question about the image...")
-    submit_button = gr.Button("Submit")
-    state = gr.State([])
-    # Connect the submit button and textbox to the chatbot function
-    submit_button.click(fn=chatbot, inputs=[image_input, text_input, state], outputs=chatbot_output)
-    text_input.submit(fn=chatbot, inputs=[image_input, text_input, state], outputs=chatbot_output)
-# Launch the Gradio app with GPU
-demo.launch(share=True)

 import gradio as gr
+from transformers import AutoModelForCausalLM, AutoProcessor, GenerationConfig
 from PIL import Image
+# Checkpoint repo and the keyword arguments shared by both from_pretrained() calls below
+repo_name = "cyan2k/molmo-7B-O-bnb-4bit"
+arguments = {
+    "device_map": "auto",
+    "torch_dtype": "auto",
+    "trust_remote_code": True,
+    "load_in_8bit": True  # NOTE(review): requests 8-bit loading, but the repo name says bnb-4bit - confirm which quantization actually applies
+}
+# The same kwargs go to both loaders (quantization flag included - TODO confirm the processor ignores it)
+processor = AutoProcessor.from_pretrained(repo_name, **arguments)
+model = AutoModelForCausalLM.from_pretrained(repo_name, **arguments)
+def describe_image(image):
+    """Return a detailed text description of ``image`` generated by the model.
+
+    Args:
+        image: The uploaded picture (the UI is configured to pass a PIL
+            image), or ``None`` when the form is submitted without an upload.
+
+    Returns:
+        The decoded description text, or a short message asking for an
+        image when none was supplied.
+    """
+    # Gradio passes None when the user submits without uploading anything;
+    # answer with a hint instead of failing inside the processor.
+    if image is None:
+        return "Please upload an image first."
+    # Preprocess the image together with the fixed description prompt
+    inputs = processor.process(
+        images=[image],
+        text="Describe this image in great detail."
+    )
+    # Move inputs to the model device and add a leading batch dimension of 1.
+    # Per the Molmo model card example, processor.process() yields unbatched
+    # tensors; without unsqueeze(0) the input_ids.size(1) slice below has no
+    # second dimension to read.
+    inputs = {k: v.to(model.device).unsqueeze(0) for k, v in inputs.items()}
+    # Generate at most 200 new tokens, stopping at the end-of-text marker
+    output = model.generate_from_batch(
+        inputs,
+        GenerationConfig(max_new_tokens=200, stop_strings="<|endoftext|>"),
+        tokenizer=processor.tokenizer,
+    )
+    # Keep only the tokens produced after the prompt, then decode them
+    generated_tokens = output[0, inputs["input_ids"].size(1):]
+    generated_text = processor.tokenizer.decode(generated_tokens, skip_special_tokens=True)
     return generated_text
+def gradio_app():
+    """Build the single-image description interface and launch it."""
+    # Widgets are created inline, in the same order as before (image input
+    # first, then the read-only text output), and the interface is launched
+    # as soon as it is constructed.
+    gr.Interface(
+        fn=describe_image,
+        inputs=gr.Image(type="pil", label="Upload Image"),
+        outputs=gr.Textbox(label="Image Description", interactive=False),
+        title="Image Description App",
+        description="Upload an image and get a detailed description using the Molmo 7B model",
+    ).launch()
+# Entry point: build the Gradio interface and launch it when this script runs
+gradio_app()