Update app.py
app.py CHANGED

@@ -40,7 +40,7 @@ def extract_frames(video_in, output_format='.jpg'):
     # Adjust interval to video length
     video_clip = VideoFileClip(video_in)
     if video_clip.duration <= 5:
-        interval =
+        interval = 6
     else :
         interval = 24
 
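For context: `interval` here is a frame-sampling step used by `extract_frames`. The function body is outside this diff, so the following is only a minimal sketch of how such an interval could drive sampling with moviepy; the frame-stepping loop and output naming are assumptions, not the Space's actual code.

```python
from moviepy.editor import VideoFileClip

def extract_frames(video_in, output_format='.jpg'):
    # Adjust interval to video length (mirrors the diff above)
    video_clip = VideoFileClip(video_in)
    interval = 6 if video_clip.duration <= 5 else 24

    # Assumption: keep one frame every `interval` source frames
    fps = video_clip.fps or 24
    frame_files = []
    frame_index = 0
    while frame_index / fps < video_clip.duration:
        frame_file = f"frame_{frame_index:05d}{output_format}"
        video_clip.save_frame(frame_file, t=frame_index / fps)
        frame_files.append(frame_file)
        frame_index += interval
    video_clip.close()
    return frame_files
```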
@@ -165,9 +165,11 @@ def llm_process(user_prompt):
 
 def infer(video_in):
     # Extract frames from a video
+    gr.Info("Extracting frames...")
     frame_files = extract_frames(video_in)
 
     # Process each extracted frame and collect results in a list
+    gr.Info("Captioning frames ...")
     processed_texts = []
     for frame_file in frame_files:
         text = process_image(frame_file)
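Gradio exposes `gr.Info` (capital I) for toast notifications; the lowercase `gr.info` has no such attribute and would raise an `AttributeError` at runtime. A minimal, self-contained illustration of how `gr.Info` behaves inside an event handler:

```python
import gradio as gr

def infer(video_in):
    # gr.Info pops a transient toast in the browser while the handler runs
    gr.Info("Extracting frames...")
    return f"Received: {video_in}"

with gr.Blocks() as demo:
    video = gr.Video()
    btn = gr.Button("Describe")
    out = gr.Textbox()
    btn.click(infer, inputs=video, outputs=out)

demo.launch()
```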
@@ -184,6 +186,7 @@ def infer(video_in):
         print(extracted_audio)
 
         # Get description of audio content
+        gr.Info("Getting audio description from extracted sound ...")
         audio_content_described = get_salmonn(extracted_audio)
     else :
         audio_content_described = "Video has no sound."
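`get_salmonn` presumably forwards the extracted audio to a hosted SALMONN demo. Its implementation is outside this diff; a hypothetical sketch using `gradio_client`, where the Space name, endpoint, and argument list are placeholders rather than the app's real configuration:

```python
from gradio_client import Client

# Placeholder Space name; the real app may call a different SALMONN host
salmonn_client = Client("some-org/SALMONN-demo")

def get_salmonn(extracted_audio):
    # Ask SALMONN to describe the audio track extracted from the video
    return salmonn_client.predict(
        extracted_audio,        # path to the extracted audio file
        "Describe the audio.",  # assumed text prompt parameter
        api_name="/predict",    # assumed endpoint name
    )
```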
@@ -195,6 +198,7 @@ def infer(video_in):
     print(formatted_captions)
 
     # Send formatted captions to LLM
+    gr.Info("Trying to provide a video understanding from the gathered elements ...")
     video_description_from_llm = llm_process(formatted_captions)
 
     return video_description_from_llm
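`formatted_captions` is assembled earlier in `infer`, outside the hunks shown here. A plausible shape for that formatting step, purely illustrative (the helper name and prompt layout are assumptions):

```python
def format_captions(processed_texts, audio_content_described):
    # Label each frame caption, then append the audio description, so the
    # LLM receives one structured prompt
    frame_lines = "\n".join(
        f"Frame {i}: {caption}" for i, caption in enumerate(processed_texts)
    )
    return (
        "Visual captions:\n" + frame_lines
        + "\n\nAudio description:\n" + audio_content_described
    )
```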
@@ -213,7 +217,12 @@ div#video-text textarea {
 with gr.Blocks(css=css) as demo :
     with gr.Column(elem_id="col-container"):
         gr.HTML("""
-        <h2 style="text-align: center;">Soft
+        <h2 style="text-align: center;">Soft Video Understanding</h2>
+        <p style="text-align: center;">
+        An experiment that tries to achieve what I call "soft video understanding" with available open-source models. <br />
+        We use moondream1 to caption extracted frames and SALMONN to analyze extracted audio, then send the visual and audio details to Zephyr, which is instructed to summarize what it understood.
+        The instruction prompt is available for further discussion with the Community.
+        </p>
         """)
         with gr.Row():
             with gr.Column():
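The header text names Zephyr as the summarizing LLM. `llm_process` itself is also outside this diff; assuming it reaches Zephyr through `huggingface_hub`'s `InferenceClient`, it could look roughly like this (the model id, system prompt, and generation parameters are assumptions):

```python
from huggingface_hub import InferenceClient

client = InferenceClient("HuggingFaceH4/zephyr-7b-beta")  # assumed model id

def llm_process(user_prompt):
    # Instruct the model to summarize the combined visual and audio captions
    messages = [
        {"role": "system",
         "content": "You are given per-frame captions and an audio "
                    "description of a video. Summarize what the video shows."},
        {"role": "user", "content": user_prompt},
    ]
    response = client.chat_completion(messages, max_tokens=512)
    return response.choices[0].message.content
```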