HaithamIsmail committed
Commit: e17f518
Parent(s): 7835c3b

add video link, fixed frames sampling
README.md CHANGED

@@ -63,6 +63,8 @@ Here is the revised README that incorporates these setup instructions into a com
 
 ---
 
+### Demo Video: https://youtu.be/S-Sbn1NTWq0
+
 # 🎬 HyperClipper: Your AI Video Librarian 🤖
 
 Tired of scrubbing through hours of video to find that *one* perfect moment? HyperClipper is your personal AI video librarian that watches, understands, and catalogs your entire video library, making every second instantly searchable.
app.py CHANGED

@@ -11,7 +11,7 @@ import shutil
 from utils import get_text_embedding, sample_from_video, convert_image_to_base64
 from config import load_config
 from lancedb_utils import retreive_clip
-import
+from gradio import ChatMessage
 
 app_config = load_config()
 langchain_message_history = []

@@ -174,9 +174,9 @@ def get_clip(clip_id: str):
         list: list of frames
     """
     print("clip id", clip_id)
-    clip = retreive_clip(clip_id)
+    clip = retreive_clip(clip_id, app_config.LANCEDB_URI.get_secret_value())
     images = sample_from_video(clip["clip_path"])
-    base64_images = [convert_image_to_base64(image) for image in images]
+    base64_images = [convert_image_to_base64(image, "png") for image in images]
     return base64_images
 
 def search_and_display_clips(query_text):

@@ -211,7 +211,7 @@ def chat_agent(message, history: list):
     # Add current message
     langchain_message_history.append({"role": "user", "content": message})
 
-    llm_with_tool = chat_model.bind_tools(tools=[get_relevant_clips
+    llm_with_tool = chat_model.bind_tools(tools=[get_relevant_clips])
     tools = {"get_relevant_clips": get_relevant_clips}
 
     # The agent loop

@@ -244,6 +244,7 @@ def chat_agent_mm(message, history):
     global latest_search_results, langchain_message_history
 
     langchain_message_history.append({"role": "user", "content": message})
+    history.append({"role": "user", "content": message})
 
     print(langchain_message_history)
     llm_with_tool = chat_model_vlm.bind_tools(tools=[get_relevant_clips, get_clip])

@@ -258,14 +259,40 @@ def chat_agent_mm(message, history):
 
     for tool_call in ai_response.tool_calls:
         print(tool_call)
+        langchain_message_history.append(
+            {
+                "role": "assistant",
+                "content": "",
+                "tool_calls": [
+                    tool_call
+                ]
+            }
+        )
+        history.append(
+            {
+                "role": "assistant",
+                "content": "",
+                "tool_calls": [
+                    tool_call
+                ]
+            }
+        )
         tool_output = tools[tool_call["name"]].invoke(tool_call)
         if tool_call["name"] == "get_clip":
-
-            "role": "tool",
-
-
-
-
+            tool_call_log = {
+                "role": "tool",
+                "tool_call_id": tool_output.tool_call_id,
+                "content": "retrieved clip will be provided by the user after this message"
+            }
+            history.append(tool_call_log)
+            langchain_message_history.extend([
+                tool_call_log,
+                {
+                    "role": "user", "content": [
+                        {"type": "text", "text": "here is the clip retreived by the tool"},
+                        *map(lambda x: {"type": "image_url", "image_url": {"url": f'data:image/png;base64,{x}'}}, tool_output.content)
+                    ],
+                }])
         else:
             tool_call_log = {
                 "role": "tool",

@@ -273,6 +300,7 @@ def chat_agent_mm(message, history):
                 "content": tool_output.content
             }
             langchain_message_history.append(tool_call_log)
+            history.append(tool_call_log)
 
     content = ai_response.content
     if "</think>" in content:

@@ -281,7 +309,8 @@ def chat_agent_mm(message, history):
     # The global state `latest_search_results` is updated by the tool.
    # The text response is returned.
     langchain_message_history.append({"role": "assistant", "content": content})
-
+    history.append({"role": "assistant", "content": content})
+    return history
 
 def get_latest_clips_for_display():
     """Get the latest search results for display in the UI."""

@@ -629,8 +658,8 @@ with gr.Blocks(title="Video Search Agent", theme=gr.themes.Soft()) as demo:
             original_filename = "uploaded_video.mp4"
             temp_dir = tempfile.mkdtemp()
             tmp_path = os.path.join(temp_dir, original_filename)
-
-
+            with open(tmp_path, "wb") as f:
+                f.write(file_obj)
 
             # Run the video processing pipeline
             run_pipeline(tmp_path)

@@ -644,7 +673,6 @@ with gr.Blocks(title="Video Search Agent", theme=gr.themes.Soft()) as demo:
 
             return f"✅ Video analysis complete for '{original_filename}'. You can now search for clips from this video."
         except Exception as e:
-            traceback.print_exc()
             return f"❌ Error during video analysis: {str(e)}"
 
     analyze_btn.click(

@@ -656,10 +684,9 @@ with gr.Blocks(title="Video Search Agent", theme=gr.themes.Soft()) as demo:
 # Launch the application
 if __name__ == "__main__":
     print("🚀 Starting Video Search Agent...")
-    print("📊 Using CLIP model for embeddings:", app_config.CLIP_MODEL_NAME)
 
     demo.launch(
-        server_name="
+        server_name="localhost",
         server_port=7860,
-
+        share=False,
     )
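The chat_agent_mm hunks above add per-call bookkeeping: each tool call is logged as an OpenAI-style assistant message carrying a `tool_calls` field, followed by a matching `tool` message, so the next model turn sees the complete call/response pair (and the Gradio `history` now mirrors it). A minimal sketch of that pattern, assuming a recent langchain-core/langchain-openai where invoking a tool with a tool_call dict returns a ToolMessage; the `search_clips` tool, the model name, and the query are placeholders, not the project's code:

```python
from langchain_core.tools import tool
from langchain_openai import ChatOpenAI


@tool
def search_clips(query: str) -> str:
    """Return a fake clip id for a text query (placeholder tool)."""
    return "clip-123"


llm = ChatOpenAI(model="gpt-4o-mini").bind_tools(tools=[search_clips])
tools = {"search_clips": search_clips}

messages = [{"role": "user", "content": "find the goal celebration"}]
ai_msg = llm.invoke(messages)

for tool_call in ai_msg.tool_calls:
    # Log the assistant turn that requested the call...
    messages.append({"role": "assistant", "content": "", "tool_calls": [tool_call]})
    # ...run the tool (invoking it with the tool_call dict returns a ToolMessage)...
    tool_msg = tools[tool_call["name"]].invoke(tool_call)
    # ...and log the result under the same tool_call_id.
    messages.append({"role": "tool", "tool_call_id": tool_msg.tool_call_id,
                     "content": tool_msg.content})

# The follow-up turn now sees the full call/response pair.
print(llm.invoke(messages).content)
```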
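When the called tool is `get_clip`, the sampled frames come back as base64 strings and are fed back to the vision model as a user message built from `image_url` data-URI parts (the hunk above also records a stub `tool` message first). A small sketch of that encoding path, assuming Pillow; this `convert_image_to_base64` is a plausible stand-in for the project's utility, and the solid gray frame is a placeholder:

```python
import base64
import io

from PIL import Image


def convert_image_to_base64(image: Image.Image, fmt: str = "png") -> str:
    """Encode a PIL image to a base64 string in the given format."""
    buffer = io.BytesIO()
    image.save(buffer, format=fmt.upper())
    return base64.b64encode(buffer.getvalue()).decode("utf-8")


# Stand-in for frames sampled from a retrieved clip.
frames = [Image.new("RGB", (64, 64), "gray")]
b64_frames = [convert_image_to_base64(frame, "png") for frame in frames]

# One user message mixing a text part with one image_url part per frame,
# matching the message shape assembled after a get_clip tool call.
user_msg = {
    "role": "user",
    "content": [
        {"type": "text", "text": "here is the clip retrieved by the tool"},
        *[
            {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{b64}"}}
            for b64 in b64_frames
        ],
    ],
}
print(f"{len(user_msg['content']) - 1} image part(s) attached")
```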
utils.py CHANGED

@@ -5,6 +5,7 @@ import io
 import base64
 from PIL import Image
 from typing import List, Union
+import uuid
 import cv2
 
 def create_directory(directory):

@@ -114,28 +115,75 @@ def sample_from_video(video_path: str, sampling_rate=0.5) -> list[Image.Image]:
     Args:
         video_path (str): path to video
         sampling_rate (float): frames per second, how many frames to take from each second
+            e.g., 0.5 means take 1 frame every 2 seconds.
+            e.g., 2 means take 2 frames every 1 second.
 
     Returns:
         list[Image.Image]: a list of PIL images
     """
+    print(f"Attempting to open video: {video_path}")
     video = cv2.VideoCapture(video_path)
+
+    if not video.isOpened():
+        print(f"Error: Could not open video {video_path}")
+        return []
+
     total_frames = int(video.get(cv2.CAP_PROP_FRAME_COUNT))
     fps = video.get(cv2.CAP_PROP_FPS)
-
-
+
+    if fps == 0:  # Handle cases where FPS might not be readable or is zero
+        print(f"Error: Video FPS is {fps}. Cannot calculate sampling.")
+        video.release()
+        return []
+
+    if sampling_rate <= 0:
+        print(f"Error: sampling_rate ({sampling_rate}) must be positive.")
+        video.release()
+        return []
+
+    # Calculate the frame interval.
+    # If sampling_rate is 0.5 FPS (1 frame every 2s) and video is 30 FPS,
+    # interval = 30 / 0.5 = 60. So, take frame 0, 60, 120...
+    # If sampling_rate is 2 FPS (2 frames every 1s) and video is 30 FPS,
+    # interval = 30 / 2 = 15. So, take frame 0, 15, 30...
+    frame_interval = round(fps / sampling_rate)
+    # Ensure we always advance at least one frame to avoid infinite loops if fps/sampling_rate is too small
+    frame_interval = max(1, int(frame_interval))
+
+
+    print(f"Video Info - Total Frames: {total_frames}, FPS: {fps:.2f}, Desired Sample Rate: {sampling_rate} fps")
+    print(f"Calculated frame interval: Take 1 frame every {frame_interval} original frames.")
+
+    current_frame_pos = 0
     images = []
 
-    while
-        video.set(cv2.CAP_PROP_POS_FRAMES,
-        success,
+    while current_frame_pos < total_frames:
+        video.set(cv2.CAP_PROP_POS_FRAMES, current_frame_pos)
+        success, frame_bgr = video.read()  # frame_bgr is a NumPy array in BGR format
+
         if not success:
+            # This might happen if we try to seek beyond the last valid frame
+            # or if there's a read error.
+            print(f"Warning: Failed to read frame at position {current_frame_pos}. Ending capture.")
             break
-        _, buffer = cv2.imencode(".jpg", frame)
-        images.append(Image.fromarray(cv2.cvtColor(buffer, cv2.COLOR_BGR2RGB)))
-        curr_frame += frames_to_skip
 
-
+        # Convert the BGR frame to RGB for PIL
+        frame_rgb = cv2.cvtColor(frame_bgr, cv2.COLOR_BGR2RGB)
+
+        # Create a PIL Image from the RGB NumPy array
+        image = Image.fromarray(frame_rgb)
+
+        # If you want to display/save for debugging:
+        # image.show(title=f"Frame {current_frame_pos}")  # Displays the image
+        # image.save(f"debug_frame_{current_frame_pos}.png")  # Saves the image
 
+        images.append(image)
+        # print(f"Captured frame {current_frame_pos}")
+
+        current_frame_pos += frame_interval
+
+    video.release()
+    print(f"Successfully sampled {len(images)} images.")
     return images
 
 def convert_base64_to_image(base64_image: str) -> Image.Image:
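The rewritten `sample_from_video` steps through the video in strides of `frame_interval = max(1, round(fps / sampling_rate))` original frames, which is what fixes the frame sampling. A quick sanity check of that arithmetic (values are illustrative, and the commented call at the end is a hypothetical usage):

```python
def frame_interval(fps: float, sampling_rate: float) -> int:
    """Number of original frames to skip between kept frames."""
    return max(1, int(round(fps / sampling_rate)))


assert frame_interval(30, 0.5) == 60  # 1 frame every 2 seconds of a 30 fps video
assert frame_interval(30, 2) == 15    # 2 frames per second of a 30 fps video
assert frame_interval(10, 25) == 1    # never stalls when the rate exceeds the fps

# Hypothetical usage:
# frames = sample_from_video("clips/example.mp4", sampling_rate=0.5)
```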