youssef committed
Commit c0d1640 · 1 parent: abf26d0

optimize

src/video_processor/processor.py CHANGED (+40 -38)
@@ -55,34 +55,40 @@ class VideoAnalyzer:
             self.model_path,
             torch_dtype=torch.bfloat16,
             device_map=DEVICE,
-            _attn_implementation="flash_attention_2"
+            _attn_implementation="flash_attention_2",
+            low_cpu_mem_usage=True,
         ).to(DEVICE)
-
+
+        # Compile model for faster inference
+        self.model = torch.compile(self.model, mode="reduce-overhead")
+        logger.info(f"Model loaded and compiled on device: {self.model.device}")

     def analyze_segment(self, video_path: str, start_time: float) -> str:
         """Analyze a single video segment."""
         messages = [
             {
                 "role": "system",
-                "content": [{"type": "text", "text": """You are a detailed video analysis assistant
-1.
-2.
-3.
-4.
-5.
-
+                "content": [{"type": "text", "text": """You are a detailed video analysis assistant. Analyze and describe:
+1. People: their appearance, actions, and interactions
+2. Environment: location, weather, time of day, lighting
+3. Objects: key items, their positions and movements
+4. Text: any visible text, signs, or captions
+5. Events: what is happening in sequence
+6. Visual details: colors, patterns, visual effects
+Be specific about timing and details to enable searching through the video later."""}]
             },
             {
                 "role": "user",
                 "content": [
                     {"type": "video", "path": video_path},
-                    {"type": "text", "text": """Describe this
--
-- What
-- What
--
-- What
-Be specific about visual details but stay concise."""}
+                    {"type": "text", "text": """Describe this segment comprehensively. Include:
+- Who appears and what are they doing?
+- What is the environment and weather like?
+- What objects or items are visible?
+- Is there any text visible on screen?
+- What actions or events are occurring?
+- Note any significant visual details
+Be specific about all visual elements to enable searching later."""}
                 ]
             }
         ]
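A note on the loading changes above: torch.compile with mode="reduce-overhead" captures CUDA graphs, which pays off here because analyze_segment calls generate() many times with similarly shaped inputs; the first call absorbs a one-time warm-up cost. Below is a minimal sketch of the loading pattern this hunk modifies, assuming a transformers vision-language model; the Auto class and checkpoint name are illustrative guesses, since the diff does not show the from_pretrained call itself.

import torch
from transformers import AutoModelForImageTextToText, AutoProcessor

DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
MODEL_PATH = "HuggingFaceTB/SmolVLM2-2.2B-Instruct"  # hypothetical checkpoint, not from the diff

processor = AutoProcessor.from_pretrained(MODEL_PATH)
model = AutoModelForImageTextToText.from_pretrained(
    MODEL_PATH,
    torch_dtype=torch.bfloat16,
    device_map=DEVICE,
    _attn_implementation="flash_attention_2",  # requires the flash-attn package and a supported GPU
    low_cpu_mem_usage=True,  # stream weights in rather than materializing them twice in host RAM
).to(DEVICE)

# reduce-overhead trades a warm-up compilation cost for lower per-call launch overhead
model = torch.compile(model, mode="reduce-overhead")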
@@ -95,12 +101,13 @@ Be specific about visual details but stay concise."""}
             return_tensors="pt"
         ).to(DEVICE, dtype=torch.bfloat16)

-
-
-
-
-
-
+        with torch.inference_mode():
+            outputs = self.model.generate(
+                **inputs,
+                do_sample=False,
+                temperature=0.7,
+                max_new_tokens=256,
+            )
         return self.processor.batch_decode(outputs, skip_special_tokens=True)[0].split("Assistant: ")[-1]

     def process_video(self, video_path: str, segment_length: int = 10) -> List[Dict]:
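One flag worth noting in the generation hunk: with do_sample=False decoding is greedy, so temperature=0.7 has no effect (recent transformers versions emit a warning for exactly this combination). A sketch of the same call with the inert parameter dropped; the apply_chat_template keyword arguments are assumptions, since the diff only shows the tail of that call.

# Assumes `messages`, `processor`, `model`, and DEVICE as in the surrounding file.
inputs = processor.apply_chat_template(
    messages,
    add_generation_prompt=True,
    tokenize=True,
    return_dict=True,
    return_tensors="pt",
).to(DEVICE, dtype=torch.bfloat16)

with torch.inference_mode():  # disables autograd tracking for cheaper forward passes
    outputs = model.generate(
        **inputs,
        do_sample=False,     # greedy decoding: deterministic for identical inputs
        max_new_tokens=256,  # caps the per-segment description length
    )

text = processor.batch_decode(outputs, skip_special_tokens=True)[0]
description = text.split("Assistant: ")[-1]  # keep only the assistant's reply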
@@ -126,24 +133,19 @@ Be specific about visual details but stay concise."""}
             # Create segment - Optimized ffmpeg settings
             segment_path = os.path.join(temp_dir, f"segment_{start_time}.mp4")
             cmd = [
-
-
-
-
-
-
-
-
-
-
-                "-vf", "scale=640:-2",  # Resize to smaller resolution
-                "-an",  # Remove audio
-                "-pix_fmt", "yuv420p",
-                segment_path
-            ]
+                "ffmpeg",
+                "-y",
+                "-i", video_path,
+                "-ss", str(start_time),
+                "-t", str(segment_length),
+                "-c:v", "libx264",
+                "-preset", "ultrafast",  # Use ultrafast preset for speed
+                "-pix_fmt", "yuv420p",  # Ensure compatible pixel format
+                segment_path
+            ]

             ffmpeg_start = time.time()
-            subprocess.run(cmd, check=True
+            subprocess.run(cmd, check=True)
             ffmpeg_time = time.time() - ffmpeg_start

             # Analyze segment
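Two things stand out in the new ffmpeg command. First, -ss is placed after -i, which makes ffmpeg decode from the start of the file for every cut, so later segments take progressively longer to extract; putting -ss before -i seeks on the input instead. Second, the rewrite drops the old -an flag, so segments now carry audio the model never uses. A sketch with both points addressed (cut_segment is a hypothetical wrapper, not a function from the file; output is equivalent since the stream is re-encoded with libx264 either way):

import subprocess

def cut_segment(video_path: str, segment_path: str,
                start_time: float, segment_length: int) -> None:
    cmd = [
        "ffmpeg",
        "-y",                    # overwrite existing output without prompting
        "-ss", str(start_time),  # input-side seek: jumps to the timestamp instead of decoding from t=0
        "-i", video_path,
        "-t", str(segment_length),
        "-c:v", "libx264",
        "-preset", "ultrafast",
        "-pix_fmt", "yuv420p",
        "-an",                   # drop audio, as the previous version did
        segment_path,
    ]
    subprocess.run(cmd, check=True)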