Simultaneous-Segmented-Depth-Prediction

Sleeping

App Files Files Community

Alessio Grancini committed on Feb 9

Commit

2609a96

verified ·

1 Parent(s): 09543a7

Update monocular_depth_estimator.py

Browse files

Files changed (1) hide show

monocular_depth_estimator.py +61 -46

monocular_depth_estimator.py CHANGED Viewed

@@ -17,54 +17,58 @@ MODEL_FILE_URL = {
 class MonocularDepthEstimator:
     def __init__(self,
-    model_type="midas_v21_small_256",
-    model_weights_path="models/",
-    optimize=False,
-    side_by_side=False,
-    height=None,
-    square=False,
-    grayscale=False):
-    # Store parameters but don't initialize CUDA
-    self.model_type = model_type
-    self.model_weights_path = model_weights_path
-    self.is_optimize = optimize
-    self.is_square = square
-    self.is_grayscale = grayscale
-    self.height = height
-    self.side_by_side = side_by_side
-    self.model = None  # Model will be loaded in make_prediction
-    self.transform = None
-        # Download model if not exists
         if not os.path.exists(model_weights_path+model_type+".pt"):
             print("Model file not found. Downloading...")
             urllib.request.urlretrieve(MODEL_FILE_URL[model_type], model_weights_path+model_type+".pt")
             print("Model file downloaded successfully.")
     def load_model_if_needed(self):
-        """Load model if not already loaded"""
         if self.model is None:
             self.model, self.transform, self.net_w, self.net_h = load_model(
-                self.device,
                 self.model_weights_path + self.model_type + ".pt",
-                self.model_type,
-                self.is_optimize,
-                self.height,
                 self.is_square
             )
             print("Net width and height: ", (self.net_w, self.net_h))
     @spaces.GPU
     def predict(self, image, target_size):
-        """GPU-accelerated prediction"""
         # Load model if not loaded
         self.load_model_if_needed()
         # convert img to tensor and load to gpu
-        img_tensor = torch.from_numpy(image).to(self.device).unsqueeze(0)
-        if self.is_optimize and self.device == torch.device("cuda"):
             img_tensor = img_tensor.to(memory_format=torch.channels_last)
             img_tensor = img_tensor.half()
@@ -84,7 +88,7 @@ class MonocularDepthEstimator:
         return prediction
     def process_prediction(self, depth_map):
-        """Process prediction (CPU operation, no GPU needed)"""
         depth_min = depth_map.min()
         depth_max = depth_map.max()
         normalized_depth = 255 * (depth_map - depth_min) / (depth_max - depth_min)
@@ -94,25 +98,32 @@ class MonocularDepthEstimator:
         return normalized_depth/255, depth_colormap/255
-    @spaces.GPU(duration=30)
     def make_prediction(self, image):
-        """Main prediction function with GPU acceleration"""
         image = image.copy()
-        with torch.no_grad():
-            original_image_rgb = np.flip(image, 2)  # in [0, 255] (flip required to get RGB)
-            # resizing the image to feed to the model
-            image_tranformed = self.transform({"image": original_image_rgb/255})["image"]
-            # monocular depth prediction
-            pred = self.predict(image_tranformed, target_size=original_image_rgb.shape[1::-1])
-            # process the model predictions
-            depthmap, depth_colormap = self.process_prediction(pred)
-        return depthmap, depth_colormap
-    @spaces.GPU(duration=60)
     def run(self, input_path):
-        """Video processing with GPU acceleration"""
         cap = cv2.VideoCapture(input_path)
         if not cap.isOpened():
@@ -139,6 +150,10 @@ class MonocularDepthEstimator:
         cap.release()
         cv2.destroyAllWindows()
-if __name__ == "__main__":
     depth_estimator = MonocularDepthEstimator(model_type="dpt_hybrid_384")
-    depth_estimator.run("assets/videos/testvideo2.mp4")

 class MonocularDepthEstimator:
     def __init__(self,
+        model_type="midas_v21_small_256",
+        model_weights_path="models/",
+        optimize=False,
+        side_by_side=False,
+        height=None,
+        square=False,
+        grayscale=False):
+        # Store parameters but don't initialize CUDA
+        self.model_type = model_type
+        self.model_weights_path = model_weights_path
+        self.is_optimize = optimize
+        self.is_square = square
+        self.is_grayscale = grayscale
+        self.height = height
+        self.side_by_side = side_by_side
+        self.model = None
+        self.transform = None
+        self.net_w = None
+        self.net_h = None
+        print("Initializing parameters...")
+        # Download model if needed
         if not os.path.exists(model_weights_path+model_type+".pt"):
             print("Model file not found. Downloading...")
             urllib.request.urlretrieve(MODEL_FILE_URL[model_type], model_weights_path+model_type+".pt")
             print("Model file downloaded successfully.")
     def load_model_if_needed(self):
         if self.model is None:
+            print("Loading MiDaS model...")
             self.model, self.transform, self.net_w, self.net_h = load_model(
+                'cuda',
                 self.model_weights_path + self.model_type + ".pt",
+                self.model_type,
+                self.is_optimize,
+                self.height,
                 self.is_square
             )
+            print("Model loaded successfully")
             print("Net width and height: ", (self.net_w, self.net_h))
     @spaces.GPU
     def predict(self, image, target_size):
         # Load model if not loaded
         self.load_model_if_needed()
         # convert img to tensor and load to gpu
+        img_tensor = torch.from_numpy(image).to('cuda').unsqueeze(0)
+        if self.is_optimize:
             img_tensor = img_tensor.to(memory_format=torch.channels_last)
             img_tensor = img_tensor.half()
         return prediction
     def process_prediction(self, depth_map):
+        # normalizing depth image
         depth_min = depth_map.min()
         depth_max = depth_map.max()
         normalized_depth = 255 * (depth_map - depth_min) / (depth_max - depth_min)
         return normalized_depth/255, depth_colormap/255
+    @spaces.GPU
     def make_prediction(self, image):
         image = image.copy()
+        try:
+            print("Starting depth estimation...")
+            with torch.no_grad():
+                original_image_rgb = np.flip(image, 2)  # in [0, 255] (flip required to get RGB)
+                # resizing the image to feed to the model
+                self.load_model_if_needed()
+                image_tranformed = self.transform({"image": original_image_rgb/255})["image"]
+                # monocular depth prediction
+                pred = self.predict(image_tranformed, target_size=original_image_rgb.shape[1::-1])
+                # process the model predictions
+                depthmap, depth_colormap = self.process_prediction(pred)
+            print("Depth estimation complete")
+            return depthmap, depth_colormap
+        except Exception as e:
+            print(f"Error in make_prediction: {str(e)}")
+            import traceback
+            print(traceback.format_exc())
+            raise
+    @spaces.GPU
     def run(self, input_path):
         cap = cv2.VideoCapture(input_path)
         if not cap.isOpened():
         cap.release()
         cv2.destroyAllWindows()
+if __name__ == "__main__":
     depth_estimator = MonocularDepthEstimator(model_type="dpt_hybrid_384")
+    depth_estimator.run("assets/videos/testvideo2.mp4")