juanmackie committed
Commit 08e1931 · verified · 1 Parent(s): 855eb92

Update app.py

Files changed (1)
  1. app.py +58 -65
app.py CHANGED
@@ -3,42 +3,23 @@ import torch
 import numpy as np
 import cv2
 from PIL import Image
-
-# Import the DepthAnythingV2 model directly from its module.
-# IMPORTANT: This assumes you have the Depth-Anything-V2 repository cloned
-# and its 'depth_anything_v2' module is accessible in your Python path.
-# You will also need the model checkpoint in a 'checkpoints' folder.
-from depth_anything_v2.dpt import DepthAnythingV2
+# Import the pipeline from transformers
+from transformers import pipeline
 
 # Determine the device for model inference (CUDA if available, otherwise MPS/CPU)
 DEVICE = 'cuda' if torch.cuda.is_available() else 'mps' if torch.backends.mps.is_available() else 'cpu'
 
-# Model configuration for Depth Anything V2 (using 'vitl' by default)
-model_configs = {
-    'vits': {'encoder': 'vits', 'features': 64, 'out_channels': [48, 96, 192, 384]},
-    'vitb': {'encoder': 'vitb', 'features': 128, 'out_channels': [96, 192, 384, 768]},
-    'vitl': {'encoder': 'vitl', 'features': 256, 'out_channels': [256, 512, 1024, 1024]},
-    'vitg': {'encoder': 'vitg', 'features': 384, 'out_channels': [1536, 1536, 1536, 1536]}
-}
-encoder = 'vitl' # You can change this to 'vits', 'vitb', or 'vitg'
-
-# Load the pre-trained Depth Anything V2 model
-# This requires the specific .pth checkpoint file to be present.
+# Load the Depth Anything V2 model using the Hugging Face transformers pipeline
+# We'll use the 'Small-hf' version for a balance of performance and speed,
+# but you can change it to 'Base-hf' or 'Large-hf' if you prefer.
 try:
-    model = DepthAnythingV2(**model_configs[encoder])
-    # The checkpoint path should be relative to where your app.py is or an absolute path
-    state_dict = torch.load(f'checkpoints/depth_anything_v2_{encoder}.pth', map_location="cpu")
-    model.load_state_dict(state_dict)
-    model = model.to(DEVICE).eval()
-    print(f"Depth Anything V2 ({encoder}) model loaded successfully on {DEVICE}.")
-except FileNotFoundError:
-    print(f"Error: Checkpoint file 'checkpoints/depth_anything_v2_{encoder}.pth' not found.")
-    print("Please ensure you have downloaded the Depth Anything V2 model checkpoints")
-    print("and placed them in a 'checkpoints' folder. Refer to the setup instructions below.")
-    model = None # Set model to None to gracefully handle if it couldn't be loaded
+    # Initialize the depth estimation pipeline
+    # device=0 targets the first GPU (if DEVICE is 'cuda'), otherwise -1 for CPU.
+    depth_estimator = pipeline(task="depth-estimation", model="depth-anything/Depth-Anything-V2-Small-hf", device=0 if DEVICE == 'cuda' else -1)
+    print(f"Depth Anything V2 (Small-hf) model loaded successfully on {DEVICE}.")
 except Exception as e:
-    print(f"An error occurred while loading the Depth Anything V2 model: {e}")
-    model = None
+    print(f"An error occurred while loading the Depth Anything V2 model via transformers: {e}")
+    depth_estimator = None
 
 
 def process_image(image, max_disparity_ratio, inpaint_radius):
@@ -46,85 +27,95 @@ def process_image(image, max_disparity_ratio, inpaint_radius):
     Convert a 2D photo to a stereoscopic 3D image pair using Depth Anything V2
     for depth estimation and DIBR, with adjustable parameters.
     """
-    if model is None:
-        # If model failed to load, return an error image to the UI
+    if depth_estimator is None:
+        # If model failed to load, return an error image or message
         print("Error: Depth Anything V2 model not loaded. Cannot process image.")
+        # Create a blank red image to signal an error in the UI
         return Image.new('RGB', (200, 200), color = 'red')
 
-    # Convert PIL image to numpy array
+    # Convert PIL image to numpy array for DIBR processing later
     image_np = np.array(image)
     height, width = image_np.shape[:2]
 
-    # Convert RGB (PIL default) to BGR (OpenCV default, expected by Depth Anything V2's infer_image)
-    image_np_bgr = cv2.cvtColor(image_np, cv2.COLOR_RGB2BGR)
-
-    # Step 1: Estimate the depth map using Depth Anything V2
-    # The infer_image method handles its own preprocessing.
-    # The default input_size for Depth Anything V2 is 518.
-    with torch.no_grad():
-        depth_map = model.infer_image(image_np_bgr, input_size=518) # HxW raw depth map in numpy
-
-    # Normalize the depth map to [0,1] for consistent disparity calculation
+    # Step 1: Estimate the depth map using Depth Anything V2 via transformers pipeline
+    # The pipeline directly takes a PIL Image and returns a dictionary.
+    # The 'depth' key contains the predicted depth map as a PIL Image (grayscale).
+    try:
+        depth_pil = depth_estimator(image)["depth"]
+        # Convert the PIL depth image to a numpy array for further processing
+        depth_map = np.array(depth_pil)
+    except Exception as e:
+        print(f"Error during depth estimation: {e}")
+        # Return an orange image to indicate a depth estimation specific error
+        return Image.new('RGB', (200, 200), color = 'orange')
+
+    # Normalize the depth map to [0,1]
+    # This is crucial for consistent disparity calculation regardless of raw depth values.
+    # Check to prevent division by zero if all depth values are the same.
     if depth_map.max() - depth_map.min() > 0:
         depth_map = (depth_map - depth_map.min()) / (depth_map.max() - depth_map.min())
     else:
-        # Handle case of a flat depth map (all values are the same)
+        # If the depth map is flat, treat it as all at the same "depth"
         depth_map = np.zeros_like(depth_map)
 
-    # Smooth the depth map to reduce noise, which helps prevent artifacts during shifting
+    # Smooth the depth map to reduce noise and artifacts in the shifted images
     depth_map = cv2.GaussianBlur(depth_map, (5, 5), 0)
 
     # Step 2: Calculate the disparity map
-    # Disparity is the pixel shift amount; it's inversely related to depth (closer objects shift more).
+    # Disparity is inversely proportional to depth. Closer objects have higher disparity.
+    # max_disparity_pixels is calculated based on a ratio of the image width.
     max_disparity_pixels = int(max_disparity_ratio * width)
-    # Invert the depth_map values (0 for closest, 1 for furthest after normalization)
-    # so that closer objects have a larger disparity value.
+    # We invert the depth_map because smaller depth values typically mean closer objects
+    # and thus should have larger disparity for the stereo effect.
     disparity_map = max_disparity_pixels * (1 - depth_map)
 
-    # Step 3: Initialize left and right images and masks for DIBR (Depth Image Based Rendering)
-    # These will hold the shifted pixel data and mark unfilled areas.
+    # Step 3: Initialize left and right images and masks for DIBR
+    # These arrays will store the pixel data for the left and right eye views.
+    # Masks track which pixels have been filled, helping identify "holes".
     left_image = np.zeros_like(image_np)
     right_image = np.zeros_like(image_np)
     left_mask = np.ones((height, width), dtype=bool) # True means "hole"
     right_mask = np.ones((height, width), dtype=bool) # True means "hole"
 
-    # Step 4: Perform pixel shifting (forward warping)
-    # Iterate through each pixel of the original image to create the stereo views.
+    # Step 4: Perform pixel shifting based on disparity (forward warping)
+    # Iterate through each pixel of the original image and place it into the
+    # new left and right views based on its calculated disparity.
     for y in range(height):
         for x in range(width):
             disparity = int(disparity_map[y, x])
 
-            # For the left eye view, shift pixels to the right
+            # For the left eye, pixels are typically shifted to the right
             new_x_left = x + disparity
-            # For the right eye view, shift pixels to the left
+            # For the right eye, pixels are typically shifted to the left
             new_x_right = x - disparity
 
-            # Place the pixel in the left image if the new position is within bounds
+            # Place pixel in left image if within valid horizontal bounds
             if 0 <= new_x_left < width:
                 left_image[y, new_x_left] = image_np[y, x]
-                left_mask[y, new_x_left] = False # Mark as filled
+                left_mask[y, new_x_left] = False # Mark this target pixel as filled
 
-            # Place the pixel in the right image if the new position is within bounds
+            # Place pixel in right image if within valid horizontal bounds
             if 0 <= new_x_right < width:
                 right_image[y, new_x_right] = image_np[y, x]
-                right_mask[y, new_x_right] = False # Mark as filled
+                right_mask[y, new_x_right] = False # Mark this target pixel as filled
 
-    # Convert the boolean masks to uint8 format (0 or 255) for OpenCV's inpainting function
+    # Convert masks to uint8 format (0 for filled, 255 for hole) for OpenCV's inpainting function
     left_mask_uint8 = left_mask.astype(np.uint8) * 255
     right_mask_uint8 = right_mask.astype(np.uint8) * 255
 
-    # Step 5: Apply inpainting to fill the newly created "holes" or disoccluded regions
+    # Step 5: Apply inpainting to fill holes (disoccluded regions)
+    # The `cv2.INPAINT_TELEA` algorithm is generally effective for this task.
     left_image_inpaint = cv2.inpaint(left_image, left_mask_uint8, inpaint_radius, cv2.INPAINT_TELEA)
     right_image_inpaint = cv2.inpaint(right_image, right_mask_uint8, inpaint_radius, cv2.INPAINT_TELEA)
 
-    # Step 6: Combine the left and right views horizontally to create the final stereoscopic image
+    # Step 6: Combine the processed left and right images into a single side-by-side stereoscopic image
     stereo_image = np.hstack((left_image_inpaint, right_image_inpaint))
 
-    # Convert the final NumPy array to a PIL Image for Gradio display
+    # Convert the final numpy array back to a PIL Image, which Gradio uses for display
     stereo_image_pil = Image.fromarray(stereo_image)
     return stereo_image_pil
 
-# Define the Gradio web interface layout
+# Define the Gradio web interface layout and components
 with gr.Blocks() as demo:
     gr.Markdown(
         """
@@ -159,13 +150,15 @@ with gr.Blocks() as demo:
         with gr.Column():
             output_image = gr.Image(type="pil", label="Stereoscopic 3D Output (Side-by-Side)")
 
-    # Connect the button click event to the `process_image` function
+    # Connect the button click event to the image processing function.
+    # When the button is clicked, `process_image` is called with the slider values.
     process_button.click(
         fn=process_image,
         inputs=[input_image, max_disparity_slider, inpaint_radius_slider],
         outputs=output_image
     )
 
-# This block is for local execution. Hugging Face Spaces typically handles the launch.
+# This block is executed when the script is run directly (e.g., for local testing).
+# Hugging Face Spaces typically runs the app via its own internal mechanisms.
 if __name__ == '__main__':
     demo.launch()
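
The rewritten loading path depends on the transformers depth-estimation pipeline returning a dictionary whose 'depth' entry is a grayscale PIL image, which process_image then normalizes before computing disparity. Below is a minimal sketch of that dependency outside the Gradio app; it assumes transformers, torch, and Pillow are installed, and "sample.jpg" is only a placeholder path.

# Minimal sketch (not part of the commit): inspect the pipeline output that the
# updated app.py relies on. Assumes transformers, torch and Pillow are installed;
# "sample.jpg" is a placeholder input path.
import numpy as np
from PIL import Image
from transformers import pipeline

depth_estimator = pipeline(
    task="depth-estimation",
    model="depth-anything/Depth-Anything-V2-Small-hf",
    device=-1,  # CPU; pass device=0 to use the first CUDA GPU, as app.py does
)

image = Image.open("sample.jpg")  # placeholder input image
result = depth_estimator(image)

# The pipeline returns a dict: "depth" is a grayscale PIL image (the key app.py uses),
# and "predicted_depth" is the raw torch tensor produced by the model.
depth_map = np.array(result["depth"]).astype(np.float32)

# Same normalization as process_image() applies before computing disparity.
if depth_map.max() - depth_map.min() > 0:
    depth_map = (depth_map - depth_map.min()) / (depth_map.max() - depth_map.min())

print(sorted(result.keys()), depth_map.shape, float(depth_map.min()), float(depth_map.max()))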