Update app.py
app.py CHANGED
@@ -3,42 +3,23 @@ import torch
 import numpy as np
 import cv2
 from PIL import Image
-
-
-# IMPORTANT: This assumes you have the Depth-Anything-V2 repository cloned
-# and its 'depth_anything_v2' module is accessible in your Python path.
-# You will also need the model checkpoint in a 'checkpoints' folder.
-from depth_anything_v2.dpt import DepthAnythingV2
+# Import the pipeline from transformers
+from transformers import pipeline

 # Determine the device for model inference (CUDA if available, otherwise MPS/CPU)
 DEVICE = 'cuda' if torch.cuda.is_available() else 'mps' if torch.backends.mps.is_available() else 'cpu'

-#
-
-
-    'vitb': {'encoder': 'vitb', 'features': 128, 'out_channels': [96, 192, 384, 768]},
-    'vitl': {'encoder': 'vitl', 'features': 256, 'out_channels': [256, 512, 1024, 1024]},
-    'vitg': {'encoder': 'vitg', 'features': 384, 'out_channels': [1536, 1536, 1536, 1536]}
-}
-encoder = 'vitl' # You can change this to 'vits', 'vitb', or 'vitg'
-
-# Load the pre-trained Depth Anything V2 model
-# This requires the specific .pth checkpoint file to be present.
+# Load the Depth Anything V2 model using the Hugging Face transformers pipeline
+# We'll use the 'Small-hf' version for a balance of performance and speed,
+# but you can change it to 'Base-hf' or 'Large-hf' if you prefer.
 try:
-
-    #
-
-    model.
-    model = model.to(DEVICE).eval()
-    print(f"Depth Anything V2 ({encoder}) model loaded successfully on {DEVICE}.")
-except FileNotFoundError:
-    print(f"Error: Checkpoint file 'checkpoints/depth_anything_v2_{encoder}.pth' not found.")
-    print("Please ensure you have downloaded the Depth Anything V2 model checkpoints")
-    print("and placed them in a 'checkpoints' folder. Refer to the setup instructions below.")
-    model = None # Set model to None to gracefully handle if it couldn't be loaded
+    # Initialize the depth estimation pipeline
+    # device=0 targets the first GPU (if DEVICE is 'cuda'), otherwise -1 for CPU.
+    depth_estimator = pipeline(task="depth-estimation", model="depth-anything/Depth-Anything-V2-Small-hf", device=0 if DEVICE == 'cuda' else -1)
+    print(f"Depth Anything V2 (Small-hf) model loaded successfully on {DEVICE}.")
 except Exception as e:
-    print(f"An error occurred while loading the Depth Anything V2 model: {e}")
-
+    print(f"An error occurred while loading the Depth Anything V2 model via transformers: {e}")
+    depth_estimator = None


 def process_image(image, max_disparity_ratio, inpaint_radius):
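The hunk above swaps the cloned-repo import and local checkpoint loading for the Hugging Face `transformers` depth-estimation pipeline. As a minimal standalone sketch of what that pipeline returns (the input file name `photo.jpg` is only a placeholder, not part of this commit): it accepts a PIL image and yields a dict whose `"depth"` entry is a grayscale PIL image sized to match the input, which is what the warping code later relies on.

```python
import numpy as np
from PIL import Image
from transformers import pipeline

# Same checkpoint the app loads; it is downloaded from the Hub on first use.
depth_estimator = pipeline(task="depth-estimation",
                           model="depth-anything/Depth-Anything-V2-Small-hf")

image = Image.open("photo.jpg")        # placeholder input image
result = depth_estimator(image)        # returns a dict; its "depth" entry is a PIL image
depth_map = np.array(result["depth"])  # grayscale depth map as a numpy array
print(depth_map.shape, depth_map.min(), depth_map.max())
```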
@@ -46,85 +27,95 @@ def process_image(image, max_disparity_ratio, inpaint_radius):
     Convert a 2D photo to a stereoscopic 3D image pair using Depth Anything V2
     for depth estimation and DIBR, with adjustable parameters.
     """
-    if
-        # If model failed to load, return an error image
+    if depth_estimator is None:
+        # If model failed to load, return an error image or message
         print("Error: Depth Anything V2 model not loaded. Cannot process image.")
+        # Create a blank red image to signal an error in the UI
         return Image.new('RGB', (200, 200), color = 'red')

-    # Convert PIL image to numpy array
+    # Convert PIL image to numpy array for DIBR processing later
     image_np = np.array(image)
     height, width = image_np.shape[:2]

-    #
-
-
-
-
-
-
-
-
-
+    # Step 1: Estimate the depth map using Depth Anything V2 via transformers pipeline
+    # The pipeline directly takes a PIL Image and returns a dictionary.
+    # The 'depth' key contains the predicted depth map as a PIL Image (grayscale).
+    try:
+        depth_pil = depth_estimator(image)["depth"]
+        # Convert the PIL depth image to a numpy array for further processing
+        depth_map = np.array(depth_pil)
+    except Exception as e:
+        print(f"Error during depth estimation: {e}")
+        # Return an orange image to indicate a depth estimation specific error
+        return Image.new('RGB', (200, 200), color = 'orange')
+
+    # Normalize the depth map to [0,1]
+    # This is crucial for consistent disparity calculation regardless of raw depth values.
+    # Check to prevent division by zero if all depth values are the same.
     if depth_map.max() - depth_map.min() > 0:
         depth_map = (depth_map - depth_map.min()) / (depth_map.max() - depth_map.min())
     else:
-        #
+        # If the depth map is flat, treat it as all at the same "depth"
         depth_map = np.zeros_like(depth_map)

-    # Smooth the depth map to reduce noise
+    # Smooth the depth map to reduce noise and artifacts in the shifted images
     depth_map = cv2.GaussianBlur(depth_map, (5, 5), 0)

     # Step 2: Calculate the disparity map
-    # Disparity is
+    # Disparity is inversely proportional to depth. Closer objects have higher disparity.
+    # max_disparity_pixels is calculated based on a ratio of the image width.
     max_disparity_pixels = int(max_disparity_ratio * width)
-    #
-    #
+    # We invert the depth_map because smaller depth values typically mean closer objects
+    # and thus should have larger disparity for the stereo effect.
     disparity_map = max_disparity_pixels * (1 - depth_map)

-    # Step 3: Initialize left and right images and masks for DIBR
-    # These will
+    # Step 3: Initialize left and right images and masks for DIBR
+    # These arrays will store the pixel data for the left and right eye views.
+    # Masks track which pixels have been filled, helping identify "holes".
     left_image = np.zeros_like(image_np)
     right_image = np.zeros_like(image_np)
     left_mask = np.ones((height, width), dtype=bool) # True means "hole"
     right_mask = np.ones((height, width), dtype=bool) # True means "hole"

-    # Step 4: Perform pixel shifting (forward warping)
-    # Iterate through each pixel of the original image
+    # Step 4: Perform pixel shifting based on disparity (forward warping)
+    # Iterate through each pixel of the original image and place it into the
+    # new left and right views based on its calculated disparity.
     for y in range(height):
         for x in range(width):
             disparity = int(disparity_map[y, x])

-            # For the left eye
+            # For the left eye, pixels are typically shifted to the right
             new_x_left = x + disparity
-            # For the right eye
+            # For the right eye, pixels are typically shifted to the left
             new_x_right = x - disparity

-            # Place
+            # Place pixel in left image if within valid horizontal bounds
             if 0 <= new_x_left < width:
                 left_image[y, new_x_left] = image_np[y, x]
-                left_mask[y, new_x_left] = False # Mark as filled
+                left_mask[y, new_x_left] = False # Mark this target pixel as filled

-            # Place
+            # Place pixel in right image if within valid horizontal bounds
             if 0 <= new_x_right < width:
                 right_image[y, new_x_right] = image_np[y, x]
-                right_mask[y, new_x_right] = False # Mark as filled
+                right_mask[y, new_x_right] = False # Mark this target pixel as filled

-    # Convert
+    # Convert masks to uint8 format (0 for filled, 255 for hole) for OpenCV's inpainting function
     left_mask_uint8 = left_mask.astype(np.uint8) * 255
     right_mask_uint8 = right_mask.astype(np.uint8) * 255

-    # Step 5: Apply inpainting to fill
+    # Step 5: Apply inpainting to fill holes (disoccluded regions)
+    # The `cv2.INPAINT_TELEA` algorithm is generally effective for this task.
     left_image_inpaint = cv2.inpaint(left_image, left_mask_uint8, inpaint_radius, cv2.INPAINT_TELEA)
     right_image_inpaint = cv2.inpaint(right_image, right_mask_uint8, inpaint_radius, cv2.INPAINT_TELEA)

-    # Step 6: Combine the left and right
+    # Step 6: Combine the processed left and right images into a single side-by-side stereoscopic image
     stereo_image = np.hstack((left_image_inpaint, right_image_inpaint))

-    # Convert the final
+    # Convert the final numpy array back to a PIL Image, which Gradio uses for display
     stereo_image_pil = Image.fromarray(stereo_image)
     return stereo_image_pil

-# Define the Gradio web interface layout
+# Define the Gradio web interface layout and components
 with gr.Blocks() as demo:
     gr.Markdown(
         """
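The warping in Step 4 above moves one pixel at a time in a nested Python loop, which is easy to read but slow for large photos. Below is a possible vectorized variant of the same forward warp, sketched with NumPy; it is an illustration, not part of this commit, and the helper name `warp_view` is made up. One behavioural caveat: when several source pixels land on the same target column, NumPy fancy-index assignment does not define which value wins, whereas the explicit loop always keeps the right-most source pixel.

```python
import numpy as np

def warp_view(image_np, disparity_map, direction):
    """Forward-warp pixels horizontally by their integer disparity.

    direction=+1 shifts pixels right (left-eye view), direction=-1 shifts them
    left (right-eye view). Returns the warped image plus a boolean mask where
    True marks holes, matching the masks used in the loop above.
    """
    height, width = image_np.shape[:2]
    ys, xs = np.indices((height, width))
    new_x = xs + direction * disparity_map.astype(int)
    valid = (new_x >= 0) & (new_x < width)

    warped = np.zeros_like(image_np)
    holes = np.ones((height, width), dtype=bool)
    warped[ys[valid], new_x[valid]] = image_np[valid]
    holes[ys[valid], new_x[valid]] = False
    return warped, holes

# Mirrors the loop above:
# left_image, left_mask = warp_view(image_np, disparity_map, +1)
# right_image, right_mask = warp_view(image_np, disparity_map, -1)
```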
@@ -159,13 +150,15 @@ with gr.Blocks() as demo:
         with gr.Column():
             output_image = gr.Image(type="pil", label="Stereoscopic 3D Output (Side-by-Side)")

-    # Connect the button click event to the
+    # Connect the button click event to the image processing function.
+    # When the button is clicked, `process_image` is called with the slider values.
     process_button.click(
         fn=process_image,
         inputs=[input_image, max_disparity_slider, inpaint_radius_slider],
         outputs=output_image
     )

-# This block is
+# This block is executed when the script is run directly (e.g., for local testing).
+# Hugging Face Spaces typically runs the app via its own internal mechanisms.
 if __name__ == '__main__':
     demo.launch()
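To put numbers on the disparity formula used in Step 2 (`disparity = max_disparity_pixels * (1 - depth)`): the shift grows with the image width and with the slider value, and the pixels the code treats as nearest move the most. The values below are only illustrative, not defaults from this commit.

```python
width = 1280                    # example input width in pixels
max_disparity_ratio = 0.03      # example slider setting
max_disparity_pixels = int(max_disparity_ratio * width)   # -> 38

# After normalization the code treats depth 0.0 as nearest and 1.0 as farthest,
# so the nearest pixels shift by the full amount and the farthest not at all.
print(max_disparity_pixels * (1 - 0.0))   # 38.0 px for the nearest pixels
print(max_disparity_pixels * (1 - 1.0))   # 0.0 px for the farthest pixels
```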
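One note on the device handling in the model-loading hunk: with `device=0 if DEVICE == 'cuda' else -1`, a machine that selects `'mps'` still runs the pipeline on the CPU. Newer `transformers` releases also accept a device string, so a call like the sketch below could keep the MPS path; this is an assumption about the installed `transformers` version, not something this commit does.

```python
import torch
from transformers import pipeline

DEVICE = 'cuda' if torch.cuda.is_available() else 'mps' if torch.backends.mps.is_available() else 'cpu'

# Pass the device string straight through, assuming a transformers version whose
# pipeline() accepts a string or torch.device for the `device` argument.
depth_estimator = pipeline(
    task="depth-estimation",
    model="depth-anything/Depth-Anything-V2-Small-hf",
    device=DEVICE,
)
```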