Explainable-Vision-Language-Model

Running on Zero

fffiloni committed on Jan 10

Commit

44284c4

verified ·

1 Parent(s): bc31078

Update app.py

Files changed (1) hide show

app.py CHANGED Viewed

@@ -88,8 +88,19 @@ def image_vision(image_input_path, prompt):
     else:
         return answer, None
-def video_vision(video_input_path, prompt):
-    vid_frames, image_paths = read_video(video_input_path, video_interval=1)
     # create a question (<image> is a placeholder for the video frames)
     question = f"<image>{prompt}"
     result = model.predict_forward(
@@ -179,6 +190,7 @@ with gr.Blocks(analytics_enabled=False) as demo:
             with gr.Row():
                 with gr.Column():
                     video_input = gr.Video(label="Video IN")
                     with gr.Row():
                         vid_instruction = gr.Textbox(label="Instruction", scale=4)
                         submit_video_btn = gr.Button("Submit", scale=1)
@@ -188,7 +200,7 @@ with gr.Blocks(analytics_enabled=False) as demo:
             submit_video_btn.click(
                 fn = video_vision,
-                inputs = [video_input, vid_instruction],
                 outputs = [vid_output_res, output_video]
             )

     else:
         return answer, None
+def video_vision(video_input_path, prompt, video_interval):
+    # Open the original video
+    cap = cv2.VideoCapture(input_video_path)
+    # Get original video properties
+    original_fps = cap.get(cv2.CAP_PROP_FPS)
+    frame_skip_factor = video_interval
+    # Calculate new FPS
+    new_fps = original_fps / frame_skip_factor
+    vid_frames, image_paths = read_video(video_input_path, video_interval)
     # create a question (<image> is a placeholder for the video frames)
     question = f"<image>{prompt}"
     result = model.predict_forward(
             with gr.Row():
                 with gr.Column():
                     video_input = gr.Video(label="Video IN")
+                    frame_interval = gr.Slider(label="Frame interval", minimum=1, maximum=12, value=6)
                     with gr.Row():
                         vid_instruction = gr.Textbox(label="Instruction", scale=4)
                         submit_video_btn = gr.Button("Submit", scale=1)
             submit_video_btn.click(
                 fn = video_vision,
+                inputs = [video_input, vid_instruction, frame_interval],
                 outputs = [vid_output_res, output_video]
             )