youssef
committed on
Commit
·
24c2f62
1
Parent(s):
d200533
more logs
Browse files
- src/video_processor/processor.py +11 -12
src/video_processor/processor.py
CHANGED
|
@@ -23,20 +23,17 @@ class VideoAnalyzer:
|
|
| 23 |
|
| 24 |
logger.info("Initializing VideoAnalyzer")
|
| 25 |
self.model_path = "HuggingFaceTB/SmolVLM2-2.2B-Instruct"
|
| 26 |
-
logger.info(f"Loading model from {self.model_path}")
|
| 27 |
|
| 28 |
# Load processor and model
|
| 29 |
-
self.processor = AutoProcessor.from_pretrained(
|
| 30 |
-
self.model_path,
|
| 31 |
-
torch_dtype=torch.bfloat16
|
| 32 |
-
)
|
| 33 |
|
| 34 |
self.model = AutoModelForImageTextToText.from_pretrained(
|
| 35 |
self.model_path,
|
| 36 |
torch_dtype=torch.bfloat16,
|
| 37 |
# _attn_implementation="flash_attention_2"
|
| 38 |
).to(DEVICE)
|
| 39 |
-
logger.info(f"Model loaded on device: {self.model.device}
|
| 40 |
|
| 41 |
def process_video(self, video_path: str, frame_interval: int = 30) -> List[Dict]:
|
| 42 |
logger.info(f"Processing video: {video_path} with frame_interval={frame_interval}")
|
|
@@ -60,6 +57,8 @@ class VideoAnalyzer:
|
|
| 60 |
]
|
| 61 |
}
|
| 62 |
]
|
|
|
|
|
|
|
| 63 |
|
| 64 |
# Process video using chat template
|
| 65 |
inputs = self.processor.apply_chat_template(
|
|
@@ -68,13 +67,9 @@ class VideoAnalyzer:
|
|
| 68 |
tokenize=True,
|
| 69 |
return_dict=True,
|
| 70 |
return_tensors="pt"
|
| 71 |
-
).to(
|
| 72 |
-
|
| 73 |
-
# Convert inputs to bfloat16 before moving to GPU
|
| 74 |
-
#for key in inputs:
|
| 75 |
-
# if torch.is_tensor(inputs[key]):
|
| 76 |
-
# inputs[key] = inputs[key].to(dtype=torch.bfloat16, device=self.model.device)
|
| 77 |
|
|
|
|
| 78 |
# Generate description with increased token limit
|
| 79 |
generated_ids = self.model.generate(
|
| 80 |
**inputs,
|
|
@@ -82,10 +77,14 @@ class VideoAnalyzer:
|
|
| 82 |
temperature=0.7,
|
| 83 |
max_new_tokens=512 # Increased from 100 to get more detailed descriptions
|
| 84 |
)
|
|
|
|
|
|
|
| 85 |
description = self.processor.batch_decode(
|
| 86 |
generated_ids,
|
| 87 |
skip_special_tokens=True
|
| 88 |
)[0]
|
|
|
|
|
|
|
| 89 |
|
| 90 |
return [{
|
| 91 |
"description": description.split("Assistant: ")[-1] # Remove assistant prefix if present
|
|
|
|
| 23 |
|
| 24 |
logger.info("Initializing VideoAnalyzer")
|
| 25 |
self.model_path = "HuggingFaceTB/SmolVLM2-2.2B-Instruct"
|
| 26 |
+
logger.info(f"Loading model from {self.model_path} - Using device: {DEVICE}")
|
| 27 |
|
| 28 |
# Load processor and model
|
| 29 |
+
self.processor = AutoProcessor.from_pretrained(self.model_path)
|
|
|
|
|
|
|
|
|
|
| 30 |
|
| 31 |
self.model = AutoModelForImageTextToText.from_pretrained(
|
| 32 |
self.model_path,
|
| 33 |
torch_dtype=torch.bfloat16,
|
| 34 |
# _attn_implementation="flash_attention_2"
|
| 35 |
).to(DEVICE)
|
| 36 |
+
logger.info(f"Model loaded on device: {self.model.device}")
|
| 37 |
|
| 38 |
def process_video(self, video_path: str, frame_interval: int = 30) -> List[Dict]:
|
| 39 |
logger.info(f"Processing video: {video_path} with frame_interval={frame_interval}")
|
|
|
|
| 57 |
]
|
| 58 |
}
|
| 59 |
]
|
| 60 |
+
|
| 61 |
+
logger.info(f"Applying chat template - message: {messages}")
|
| 62 |
|
| 63 |
# Process video using chat template
|
| 64 |
inputs = self.processor.apply_chat_template(
|
|
|
|
| 67 |
tokenize=True,
|
| 68 |
return_dict=True,
|
| 69 |
return_tensors="pt"
|
| 70 |
+
).to(DEVICE)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 71 |
|
| 72 |
+
logger.info(f"Generating IDs")
|
| 73 |
# Generate description with increased token limit
|
| 74 |
generated_ids = self.model.generate(
|
| 75 |
**inputs,
|
|
|
|
| 77 |
temperature=0.7,
|
| 78 |
max_new_tokens=512 # Increased from 100 to get more detailed descriptions
|
| 79 |
)
|
| 80 |
+
|
| 81 |
+
logger.info(f"batch decoding...")
|
| 82 |
description = self.processor.batch_decode(
|
| 83 |
generated_ids,
|
| 84 |
skip_special_tokens=True
|
| 85 |
)[0]
|
| 86 |
+
|
| 87 |
+
|
| 88 |
|
| 89 |
return [{
|
| 90 |
"description": description.split("Assistant: ")[-1] # Remove assistant prefix if present
|