Update app.py
app.py CHANGED
@@ -3,42 +3,23 @@ import torch
 import numpy as np
 import cv2
 from PIL import Image
-
-
-# IMPORTANT: This assumes you have the Depth-Anything-V2 repository cloned
-# and its 'depth_anything_v2' module is accessible in your Python path.
-# You will also need the model checkpoint in a 'checkpoints' folder.
-from depth_anything_v2.dpt import DepthAnythingV2
+# Import the pipeline from transformers
+from transformers import pipeline

 # Determine the device for model inference (CUDA if available, otherwise MPS/CPU)
 DEVICE = 'cuda' if torch.cuda.is_available() else 'mps' if torch.backends.mps.is_available() else 'cpu'

-#
-
-
-    'vitb': {'encoder': 'vitb', 'features': 128, 'out_channels': [96, 192, 384, 768]},
-    'vitl': {'encoder': 'vitl', 'features': 256, 'out_channels': [256, 512, 1024, 1024]},
-    'vitg': {'encoder': 'vitg', 'features': 384, 'out_channels': [1536, 1536, 1536, 1536]}
-}
-encoder = 'vitl' # You can change this to 'vits', 'vitb', or 'vitg'
-
-# Load the pre-trained Depth Anything V2 model
-# This requires the specific .pth checkpoint file to be present.
+# Load the Depth Anything V2 model using the Hugging Face transformers pipeline
+# We'll use the 'Small-hf' version for a balance of performance and speed,
+# but you can change it to 'Base-hf' or 'Large-hf' if you prefer.
 try:
-
-    #
-
-    model.
-    model = model.to(DEVICE).eval()
-    print(f"Depth Anything V2 ({encoder}) model loaded successfully on {DEVICE}.")
-except FileNotFoundError:
-    print(f"Error: Checkpoint file 'checkpoints/depth_anything_v2_{encoder}.pth' not found.")
-    print("Please ensure you have downloaded the Depth Anything V2 model checkpoints")
-    print("and placed them in a 'checkpoints' folder. Refer to the setup instructions below.")
-    model = None # Set model to None to gracefully handle if it couldn't be loaded
+    # Initialize the depth estimation pipeline
+    # device=0 targets the first GPU (if DEVICE is 'cuda'), otherwise -1 for CPU.
+    depth_estimator = pipeline(task="depth-estimation", model="depth-anything/Depth-Anything-V2-Small-hf", device=0 if DEVICE == 'cuda' else -1)
+    print(f"Depth Anything V2 (Small-hf) model loaded successfully on {DEVICE}.")
 except Exception as e:
-    print(f"An error occurred while loading the Depth Anything V2 model: {e}")
-
+    print(f"An error occurred while loading the Depth Anything V2 model via transformers: {e}")
+    depth_estimator = None


 def process_image(image, max_disparity_ratio, inpaint_radius):
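The hunk above swaps the cloned-repo import and local checkpoint loading for the Hugging Face `transformers` depth-estimation pipeline. As a minimal standalone sketch of what that pipeline returns (the input file name `photo.jpg` is only a placeholder, not part of this commit): it accepts a PIL image and yields a dict whose `"depth"` entry is a grayscale PIL image sized to match the input, which is what the warping code later relies on.

```python
import numpy as np
from PIL import Image
from transformers import pipeline

# Same checkpoint the app loads; it is downloaded from the Hub on first use.
depth_estimator = pipeline(task="depth-estimation",
                           model="depth-anything/Depth-Anything-V2-Small-hf")

image = Image.open("photo.jpg")        # placeholder input image
result = depth_estimator(image)        # returns a dict; its "depth" entry is a PIL image
depth_map = np.array(result["depth"])  # grayscale depth map as a numpy array
print(depth_map.shape, depth_map.min(), depth_map.max())
```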
@@ -46,85 +27,95 @@ def process_image(image, max_disparity_ratio, inpaint_radius):
     Convert a 2D photo to a stereoscopic 3D image pair using Depth Anything V2
     for depth estimation and DIBR, with adjustable parameters.
     """
-    if
-        # If model failed to load, return an error image
+    if depth_estimator is None:
+        # If model failed to load, return an error image or message
         print("Error: Depth Anything V2 model not loaded. Cannot process image.")
+        # Create a blank red image to signal an error in the UI
         return Image.new('RGB', (200, 200), color = 'red')

-    # Convert PIL image to numpy array
+    # Convert PIL image to numpy array for DIBR processing later
     image_np = np.array(image)
     height, width = image_np.shape[:2]

-    #
-
-
-
-
-
-
-
-
-
+    # Step 1: Estimate the depth map using Depth Anything V2 via transformers pipeline
+    # The pipeline directly takes a PIL Image and returns a dictionary.
+    # The 'depth' key contains the predicted depth map as a PIL Image (grayscale).
+    try:
+        depth_pil = depth_estimator(image)["depth"]
+        # Convert the PIL depth image to a numpy array for further processing
+        depth_map = np.array(depth_pil)
+    except Exception as e:
+        print(f"Error during depth estimation: {e}")
+        # Return an orange image to indicate a depth estimation specific error
+        return Image.new('RGB', (200, 200), color = 'orange')
+
+    # Normalize the depth map to [0,1]
+    # This is crucial for consistent disparity calculation regardless of raw depth values.
+    # Check to prevent division by zero if all depth values are the same.
     if depth_map.max() - depth_map.min() > 0:
         depth_map = (depth_map - depth_map.min()) / (depth_map.max() - depth_map.min())
     else:
-        #
+        # If the depth map is flat, treat it as all at the same "depth"
         depth_map = np.zeros_like(depth_map)

-    # Smooth the depth map to reduce noise
+    # Smooth the depth map to reduce noise and artifacts in the shifted images
     depth_map = cv2.GaussianBlur(depth_map, (5, 5), 0)

     # Step 2: Calculate the disparity map
-    # Disparity is
+    # Disparity is inversely proportional to depth. Closer objects have higher disparity.
+    # max_disparity_pixels is calculated based on a ratio of the image width.
     max_disparity_pixels = int(max_disparity_ratio * width)
-    #
-    #
+    # We invert the depth_map because smaller depth values typically mean closer objects
+    # and thus should have larger disparity for the stereo effect.
     disparity_map = max_disparity_pixels * (1 - depth_map)

-    # Step 3: Initialize left and right images and masks for DIBR
-    # These will
+    # Step 3: Initialize left and right images and masks for DIBR
+    # These arrays will store the pixel data for the left and right eye views.
+    # Masks track which pixels have been filled, helping identify "holes".
     left_image = np.zeros_like(image_np)
     right_image = np.zeros_like(image_np)
     left_mask = np.ones((height, width), dtype=bool) # True means "hole"
     right_mask = np.ones((height, width), dtype=bool) # True means "hole"

-    # Step 4: Perform pixel shifting (forward warping)
-    # Iterate through each pixel of the original image
+    # Step 4: Perform pixel shifting based on disparity (forward warping)
+    # Iterate through each pixel of the original image and place it into the
+    # new left and right views based on its calculated disparity.
     for y in range(height):
         for x in range(width):
             disparity = int(disparity_map[y, x])

-            # For the left eye
+            # For the left eye, pixels are typically shifted to the right
             new_x_left = x + disparity
-            # For the right eye
+            # For the right eye, pixels are typically shifted to the left
             new_x_right = x - disparity

-            # Place
+            # Place pixel in left image if within valid horizontal bounds
             if 0 <= new_x_left < width:
                 left_image[y, new_x_left] = image_np[y, x]
-                left_mask[y, new_x_left] = False # Mark as filled
+                left_mask[y, new_x_left] = False # Mark this target pixel as filled

-            # Place
+            # Place pixel in right image if within valid horizontal bounds
             if 0 <= new_x_right < width:
                 right_image[y, new_x_right] = image_np[y, x]
-                right_mask[y, new_x_right] = False # Mark as filled
+                right_mask[y, new_x_right] = False # Mark this target pixel as filled

-    # Convert
+    # Convert masks to uint8 format (0 for filled, 255 for hole) for OpenCV's inpainting function
     left_mask_uint8 = left_mask.astype(np.uint8) * 255
     right_mask_uint8 = right_mask.astype(np.uint8) * 255

-    # Step 5: Apply inpainting to fill
+    # Step 5: Apply inpainting to fill holes (disoccluded regions)
+    # The `cv2.INPAINT_TELEA` algorithm is generally effective for this task.
     left_image_inpaint = cv2.inpaint(left_image, left_mask_uint8, inpaint_radius, cv2.INPAINT_TELEA)
     right_image_inpaint = cv2.inpaint(right_image, right_mask_uint8, inpaint_radius, cv2.INPAINT_TELEA)

-    # Step 6: Combine the left and right
+    # Step 6: Combine the processed left and right images into a single side-by-side stereoscopic image
     stereo_image = np.hstack((left_image_inpaint, right_image_inpaint))

-    # Convert the final
+    # Convert the final numpy array back to a PIL Image, which Gradio uses for display
     stereo_image_pil = Image.fromarray(stereo_image)
     return stereo_image_pil

-# Define the Gradio web interface layout
+# Define the Gradio web interface layout and components
 with gr.Blocks() as demo:
     gr.Markdown(
         """
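The warping in Step 4 above moves one pixel at a time in a nested Python loop, which is easy to read but slow for large photos. Below is a possible vectorized variant of the same forward warp, sketched with NumPy; it is an illustration, not part of this commit, and the helper name `warp_view` is made up. One behavioural caveat: when several source pixels land on the same target column, NumPy fancy-index assignment does not define which value wins, whereas the explicit loop always keeps the right-most source pixel.

```python
import numpy as np

def warp_view(image_np, disparity_map, direction):
    """Forward-warp pixels horizontally by their integer disparity.

    direction=+1 shifts pixels right (left-eye view), direction=-1 shifts them
    left (right-eye view). Returns the warped image plus a boolean mask where
    True marks holes, matching the masks used in the loop above.
    """
    height, width = image_np.shape[:2]
    ys, xs = np.indices((height, width))
    new_x = xs + direction * disparity_map.astype(int)
    valid = (new_x >= 0) & (new_x < width)

    warped = np.zeros_like(image_np)
    holes = np.ones((height, width), dtype=bool)
    warped[ys[valid], new_x[valid]] = image_np[valid]
    holes[ys[valid], new_x[valid]] = False
    return warped, holes

# Mirrors the loop above:
# left_image, left_mask = warp_view(image_np, disparity_map, +1)
# right_image, right_mask = warp_view(image_np, disparity_map, -1)
```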
@@ -159,13 +150,15 @@ with gr.Blocks() as demo:
         with gr.Column():
             output_image = gr.Image(type="pil", label="Stereoscopic 3D Output (Side-by-Side)")

-    # Connect the button click event to the
+    # Connect the button click event to the image processing function.
+    # When the button is clicked, `process_image` is called with the slider values.
     process_button.click(
         fn=process_image,
         inputs=[input_image, max_disparity_slider, inpaint_radius_slider],
         outputs=output_image
     )

-# This block is
+# This block is executed when the script is run directly (e.g., for local testing).
+# Hugging Face Spaces typically runs the app via its own internal mechanisms.
 if __name__ == '__main__':
     demo.launch()
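To put numbers on the disparity formula used in Step 2 (`disparity = max_disparity_pixels * (1 - depth)`): the shift grows with the image width and with the slider value, and the pixels the code treats as nearest move the most. The values below are only illustrative, not defaults from this commit.

```python
width = 1280                    # example input width in pixels
max_disparity_ratio = 0.03      # example slider setting
max_disparity_pixels = int(max_disparity_ratio * width)   # -> 38

# After normalization the code treats depth 0.0 as nearest and 1.0 as farthest,
# so the nearest pixels shift by the full amount and the farthest not at all.
print(max_disparity_pixels * (1 - 0.0))   # 38.0 px for the nearest pixels
print(max_disparity_pixels * (1 - 1.0))   # 0.0 px for the farthest pixels
```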
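One note on the device handling in the model-loading hunk: with `device=0 if DEVICE == 'cuda' else -1`, a machine that selects `'mps'` still runs the pipeline on the CPU. Newer `transformers` releases also accept a device string, so a call like the sketch below could keep the MPS path; this is an assumption about the installed `transformers` version, not something this commit does.

```python
import torch
from transformers import pipeline

DEVICE = 'cuda' if torch.cuda.is_available() else 'mps' if torch.backends.mps.is_available() else 'cpu'

# Pass the device string straight through, assuming a transformers version whose
# pipeline() accepts a string or torch.device for the `device` argument.
depth_estimator = pipeline(
    task="depth-estimation",
    model="depth-anything/Depth-Anything-V2-Small-hf",
    device=DEVICE,
)
```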