Update app.py
app.py CHANGED

@@ -40,7 +40,7 @@ def extract_frames(video_in, output_format='.jpg'):
     # Adjust interval to video length
     video_clip = VideoFileClip(video_in)
     if video_clip.duration <= 5:
-        interval =
+        interval = 6
     else :
         interval = 24
 
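For context: `interval` here is a frame-sampling step used by `extract_frames`. The function body is outside this diff, so the following is only a minimal sketch of how such an interval could drive sampling with moviepy; the frame-stepping loop and output naming are assumptions, not the Space's actual code.

```python
from moviepy.editor import VideoFileClip

def extract_frames(video_in, output_format='.jpg'):
    # Adjust interval to video length (mirrors the diff above)
    video_clip = VideoFileClip(video_in)
    interval = 6 if video_clip.duration <= 5 else 24

    # Assumption: keep one frame every `interval` source frames
    fps = video_clip.fps or 24
    frame_files = []
    frame_index = 0
    while frame_index / fps < video_clip.duration:
        frame_file = f"frame_{frame_index:05d}{output_format}"
        video_clip.save_frame(frame_file, t=frame_index / fps)
        frame_files.append(frame_file)
        frame_index += interval
    video_clip.close()
    return frame_files
```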
@@ -165,9 +165,11 @@ def llm_process(user_prompt):
 
 def infer(video_in):
     # Extract frames from a video
+    gr.Info("Extracting frames...")
     frame_files = extract_frames(video_in)
 
     # Process each extracted frame and collect results in a list
+    gr.Info("Captioning frames ...")
     processed_texts = []
     for frame_file in frame_files:
         text = process_image(frame_file)
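Gradio exposes `gr.Info` (capital I) for toast notifications; the lowercase `gr.info` has no such attribute and would raise an `AttributeError` at runtime. A minimal, self-contained illustration of how `gr.Info` behaves inside an event handler:

```python
import gradio as gr

def infer(video_in):
    # gr.Info pops a transient toast in the browser while the handler runs
    gr.Info("Extracting frames...")
    return f"Received: {video_in}"

with gr.Blocks() as demo:
    video = gr.Video()
    btn = gr.Button("Describe")
    out = gr.Textbox()
    btn.click(infer, inputs=video, outputs=out)

demo.launch()
```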
@@ -184,6 +186,7 @@ def infer(video_in):
         print(extracted_audio)
 
         # Get description of audio content
+        gr.Info("Getting audio description from extracted sound ...")
         audio_content_described = get_salmonn(extracted_audio)
     else :
         audio_content_described = "Video has no sound."
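`get_salmonn` presumably forwards the extracted audio to a hosted SALMONN demo. Its implementation is outside this diff; a hypothetical sketch using `gradio_client`, where the Space name, endpoint, and argument list are placeholders rather than the app's real configuration:

```python
from gradio_client import Client

# Placeholder Space name; the real app may call a different SALMONN host
salmonn_client = Client("some-org/SALMONN-demo")

def get_salmonn(extracted_audio):
    # Ask SALMONN to describe the audio track extracted from the video
    return salmonn_client.predict(
        extracted_audio,        # path to the extracted audio file
        "Describe the audio.",  # assumed text prompt parameter
        api_name="/predict",    # assumed endpoint name
    )
```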
@@ -195,6 +198,7 @@ def infer(video_in):
     print(formatted_captions)
 
     # Send formatted captions to LLM
+    gr.Info("Trying to provide a video understanding from the gathered elements ...")
     video_description_from_llm = llm_process(formatted_captions)
 
     return video_description_from_llm
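`formatted_captions` is assembled earlier in `infer`, outside the hunks shown here. A plausible shape for that formatting step, purely illustrative (the helper name and prompt layout are assumptions):

```python
def format_captions(processed_texts, audio_content_described):
    # Label each frame caption, then append the audio description, so the
    # LLM receives one structured prompt
    frame_lines = "\n".join(
        f"Frame {i}: {caption}" for i, caption in enumerate(processed_texts)
    )
    return (
        "Visual captions:\n" + frame_lines
        + "\n\nAudio description:\n" + audio_content_described
    )
```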
@@ -213,7 +217,12 @@ div#video-text textarea {
 with gr.Blocks(css=css) as demo :
     with gr.Column(elem_id="col-container"):
         gr.HTML("""
-        <h2 style="text-align: center;">Soft
+        <h2 style="text-align: center;">Soft Video Understanding</h2>
+        <p style="text-align: center;">
+        An experiment that tries to achieve what I call "soft video understanding" with available open-source models. <br />
+        We use moondream1 to caption extracted frames and SALMONN to analyze extracted audio, then send the visual and audio details to Zephyr, which is instructed to summarize what it understood.
+        The instruction prompt is available for further discussion with the Community.
+        </p>
         """)
         with gr.Row():
             with gr.Column():
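The header text names Zephyr as the summarizing LLM. `llm_process` itself is also outside this diff; assuming it reaches Zephyr through `huggingface_hub`'s `InferenceClient`, it could look roughly like this (the model id, system prompt, and generation parameters are assumptions):

```python
from huggingface_hub import InferenceClient

client = InferenceClient("HuggingFaceH4/zephyr-7b-beta")  # assumed model id

def llm_process(user_prompt):
    # Instruct the model to summarize the combined visual and audio captions
    messages = [
        {"role": "system",
         "content": "You are given per-frame captions and an audio "
                    "description of a video. Summarize what the video shows."},
        {"role": "user", "content": user_prompt},
    ]
    response = client.chat_completion(messages, max_tokens=512)
    return response.choices[0].message.content
```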