Spaces:

prithivMLmods
/

DocScope-R1

Running on Zero

App Files Community

prithivMLmods committed on May 30

Commit

9e55e35

verified ·

1 Parent(s): c27c463

Update app.py

Browse files

Files changed (1) hide show

app.py +51 -17

app.py CHANGED Viewed

@@ -23,15 +23,24 @@ from transformers.image_utils import load_image
 # Constants for text generation
 MAX_MAX_NEW_TOKENS = 2048
 DEFAULT_MAX_NEW_TOKENS = 1024
-# Increase or disable input truncation to avoid token mismatches
 MAX_INPUT_TOKEN_LENGTH = int(os.getenv("MAX_INPUT_TOKEN_LENGTH", "8192"))
 device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
-MODEL_ID = "nvidia/Cosmos-Reason1-7B"
-processor = AutoProcessor.from_pretrained(MODEL_ID, trust_remote_code=True)
 model_m = Qwen2_5_VLForConditionalGeneration.from_pretrained(
-    MODEL_ID,
     trust_remote_code=True,
     torch_dtype=torch.float16
 ).to("cuda").eval()
@@ -45,13 +54,12 @@ def downsample_video(video_path):
     total_frames = int(vidcap.get(cv2.CAP_PROP_FRAME_COUNT))
     fps = vidcap.get(cv2.CAP_PROP_FPS)
     frames = []
-    # Sample 10 evenly spaced frames.
     frame_indices = np.linspace(0, total_frames - 1, 10, dtype=int)
     for i in frame_indices:
         vidcap.set(cv2.CAP_PROP_POS_FRAMES, i)
         success, image = vidcap.read()
         if success:
-            image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)  # Convert BGR to RGB
             pil_image = Image.fromarray(image)
             timestamp = round(i / fps, 2)
             frames.append((pil_image, timestamp))
@@ -59,15 +67,25 @@ def downsample_video(video_path):
     return frames
 @spaces.GPU
-def generate_image(text: str, image: Image.Image,
                    max_new_tokens: int = 1024,
                    temperature: float = 0.6,
                    top_p: float = 0.9,
                    top_k: int = 50,
                    repetition_penalty: float = 1.2):
     """
-    Generates responses using the Cosmos-Reason1 model for image input.
     """
     if image is None:
         yield "Please upload an image."
         return
@@ -90,7 +108,7 @@ def generate_image(text: str, image: Image.Image,
     ).to("cuda")
     streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)
     generation_kwargs = {**inputs, "streamer": streamer, "max_new_tokens": max_new_tokens}
-    thread = Thread(target=model_m.generate, kwargs=generation_kwargs)
     thread.start()
     buffer = ""
     for new_text in streamer:
@@ -100,15 +118,25 @@ def generate_image(text: str, image: Image.Image,
         yield buffer
 @spaces.GPU
-def generate_video(text: str, video_path: str,
                    max_new_tokens: int = 1024,
                    temperature: float = 0.6,
                    top_p: float = 0.9,
                    top_k: int = 50,
                    repetition_penalty: float = 1.2):
     """
-    Generates responses using the Cosmos-Reason1 model for video input.
     """
     if video_path is None:
         yield "Please upload a video."
         return
@@ -118,7 +146,6 @@ def generate_video(text: str, video_path: str,
         {"role": "system", "content": [{"type": "text", "text": "You are a helpful assistant."}]},
         {"role": "user", "content": [{"type": "text", "text": text}]}
     ]
-    # Append each frame with its timestamp.
     for frame in frames:
         image, timestamp = frame
         messages[1]["content"].append({"type": "text", "text": f"Frame {timestamp}:"})
@@ -143,7 +170,7 @@ def generate_video(text: str, video_path: str,
         "top_k": top_k,
         "repetition_penalty": repetition_penalty,
     }
-    thread = Thread(target=model_m.generate, kwargs=generation_kwargs)
     thread.start()
     buffer = ""
     for new_text in streamer:
@@ -163,7 +190,6 @@ video_examples = [
     ["Identify the main actions in the video", "videos/2.mp4"]
 ]
 css = """
 .submit-btn {
     background-color: #2980b9 !important;
@@ -176,13 +202,17 @@ css = """
 # Create the Gradio Interface
 with gr.Blocks(css=css, theme="bethecloud/storj_theme") as demo:
-    gr.Markdown("# **Cosmos-Reason1 by [NVIDIA](https://huggingface.co/nvidia/Cosmos-Reason1-7B)**")
     with gr.Row():
         with gr.Column():
             with gr.Tabs():
                 with gr.TabItem("Image Inference"):
                     image_query = gr.Textbox(label="Query Input", placeholder="Enter your query here...")
                     image_upload = gr.Image(type="pil", label="Image")
                     image_submit = gr.Button("Submit", elem_classes="submit-btn")
                     gr.Examples(
                         examples=image_examples,
@@ -191,6 +221,10 @@ with gr.Blocks(css=css, theme="bethecloud/storj_theme") as demo:
                 with gr.TabItem("Video Inference"):
                     video_query = gr.Textbox(label="Query Input", placeholder="Enter your query here...")
                     video_upload = gr.Video(label="Video")
                     video_submit = gr.Button("Submit", elem_classes="submit-btn")
                     gr.Examples(
                         examples=video_examples,
@@ -208,12 +242,12 @@ with gr.Blocks(css=css, theme="bethecloud/storj_theme") as demo:
     image_submit.click(
         fn=generate_image,
-        inputs=[image_query, image_upload, max_new_tokens, temperature, top_p, top_k, repetition_penalty],
         outputs=output
     )
     video_submit.click(
         fn=generate_video,
-        inputs=[video_query, video_upload, max_new_tokens, temperature, top_p, top_k, repetition_penalty],
         outputs=output
     )

 # Constants for text generation
 MAX_MAX_NEW_TOKENS = 2048
 DEFAULT_MAX_NEW_TOKENS = 1024
 MAX_INPUT_TOKEN_LENGTH = int(os.getenv("MAX_INPUT_TOKEN_LENGTH", "8192"))
 device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
+# Load Cosmos-Reason1-7B
+MODEL_ID_M = "nvidia/Cosmos-Reason1-7B"
+processor_m = AutoProcessor.from_pretrained(MODEL_ID_M, trust_remote_code=True)
 model_m = Qwen2_5_VLForConditionalGeneration.from_pretrained(
+    MODEL_ID_M,
+    trust_remote_code=True,
+    torch_dtype=torch.float16
+).to("cuda").eval()
+# Load MiMo-VL-7B-RL
+MODEL_ID_X = "XiaomiMiMo/MiMo-VL-7B-RL"
+processor_x = AutoProcessor.from_pretrained(MODEL_ID_X, trust_remote_code=True)
+model_x = Qwen2_5_VLForConditionalGeneration.from_pretrained(
+    MODEL_ID_X,
     trust_remote_code=True,
     torch_dtype=torch.float16
 ).to("cuda").eval()
     total_frames = int(vidcap.get(cv2.CAP_PROP_FRAME_COUNT))
     fps = vidcap.get(cv2.CAP_PROP_FPS)
     frames = []
     frame_indices = np.linspace(0, total_frames - 1, 10, dtype=int)
     for i in frame_indices:
         vidcap.set(cv2.CAP_PROP_POS_FRAMES, i)
         success, image = vidcap.read()
         if success:
+            image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
             pil_image = Image.fromarray(image)
             timestamp = round(i / fps, 2)
             frames.append((pil_image, timestamp))
     return frames
 @spaces.GPU
+def generate_image(model_name: str, text: str, image: Image.Image,
                    max_new_tokens: int = 1024,
                    temperature: float = 0.6,
                    top_p: float = 0.9,
                    top_k: int = 50,
                    repetition_penalty: float = 1.2):
     """
+    Generates responses using the selected model for image input.
     """
+    if model_name == "Cosmos-Reason1-7B":
+        processor = processor_m
+        model = model_m
+    elif model_name == "MiMo-VL-7B-RL":
+        processor = processor_x
+        model = model_x
+    else:
+        yield "Invalid model selected."
+        return
     if image is None:
         yield "Please upload an image."
         return
     ).to("cuda")
     streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)
     generation_kwargs = {**inputs, "streamer": streamer, "max_new_tokens": max_new_tokens}
+    thread = Thread(target=model.generate, kwargs=generation_kwargs)
     thread.start()
     buffer = ""
     for new_text in streamer:
         yield buffer
 @spaces.GPU
+def generate_video(model_name: str, text: str, video_path: str,
                    max_new_tokens: int = 1024,
                    temperature: float = 0.6,
                    top_p: float = 0.9,
                    top_k: int = 50,
                    repetition_penalty: float = 1.2):
     """
+    Generates responses using the selected model for video input.
     """
+    if model_name == "Cosmos-Reason1-7B":
+        processor = processor_m
+        model = model_m
+    elif model_name == "MiMo-VL-7B-RL":
+        processor = processor_x
+        model = model_x
+    else:
+        yield "Invalid model selected."
+        return
     if video_path is None:
         yield "Please upload a video."
         return
         {"role": "system", "content": [{"type": "text", "text": "You are a helpful assistant."}]},
         {"role": "user", "content": [{"type": "text", "text": text}]}
     ]
     for frame in frames:
         image, timestamp = frame
         messages[1]["content"].append({"type": "text", "text": f"Frame {timestamp}:"})
         "top_k": top_k,
         "repetition_penalty": repetition_penalty,
     }
+    thread = Thread(target=model.generate, kwargs=generation_kwargs)
     thread.start()
     buffer = ""
     for new_text in streamer:
     ["Identify the main actions in the video", "videos/2.mp4"]
 ]
 css = """
 .submit-btn {
     background-color: #2980b9 !important;
 # Create the Gradio Interface
 with gr.Blocks(css=css, theme="bethecloud/storj_theme") as demo:
+    gr.Markdown("# **Vision-Language Model Inference**")
     with gr.Row():
         with gr.Column():
             with gr.Tabs():
                 with gr.TabItem("Image Inference"):
                     image_query = gr.Textbox(label="Query Input", placeholder="Enter your query here...")
                     image_upload = gr.Image(type="pil", label="Image")
+                    model_choice = gr.Dropdown(
+                        choices=["Cosmos-Reason1-7B", "MiMo-VL-7B-RL"],
+                        label="Select Model",
+                        value="Cosmos-Reason1-7B")
                     image_submit = gr.Button("Submit", elem_classes="submit-btn")
                     gr.Examples(
                         examples=image_examples,
                 with gr.TabItem("Video Inference"):
                     video_query = gr.Textbox(label="Query Input", placeholder="Enter your query here...")
                     video_upload = gr.Video(label="Video")
+                    model_choice = gr.Dropdown(
+                        choices=["Cosmos-Reason1-7B", "MiMo-VL-7B-RL"],
+                        label="Select Model",
+                        value="Cosmos-Reason1-7B")
                     video_submit = gr.Button("Submit", elem_classes="submit-btn")
                     gr.Examples(
                         examples=video_examples,
     image_submit.click(
         fn=generate_image,
+        inputs=[model_choice, image_query, image_upload, max_new_tokens, temperature, top_p, top_k, repetition_penalty],
         outputs=output
     )
     video_submit.click(
         fn=generate_video,
+        inputs=[model_choice, video_query, video_upload, max_new_tokens, temperature, top_p, top_k, repetition_penalty],
         outputs=output
     )