import os
from typing import Optional

import cv2
import gradio as gr
import numpy as np
import supervision as sv
import torch
from PIL import Image
from tqdm import tqdm
from gradio_image_prompter import ImagePrompter

from utils.models import load_models, CHECKPOINT_NAMES, MODE_NAMES, \
    MASK_GENERATION_MODE, BOX_PROMPT_MODE, VIDEO_SEGMENTATION_MODE
from utils.video import create_directory, generate_unique_name
from sam2.build_sam import build_sam2_video_predictor
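
# `utils.models` and `utils.video` are local helper modules bundled with this
# Space and are not shown here; the comments below describe what they appear
# to provide, judging from how they are used in this file.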

MARKDOWN = """
# Segment Anything Model 2 🔥

<div>
    <a href="https://github.com/facebookresearch/segment-anything-2">
        <img src="https://badges.aleen42.com/src/github.svg" alt="GitHub" style="display:inline-block;">
    </a>
    <a href="https://colab.research.google.com/github/roboflow-ai/notebooks/blob/main/notebooks/how-to-segment-images-with-sam-2.ipynb">
        <img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Colab" style="display:inline-block;">
    </a>
    <a href="https://blog.roboflow.com/what-is-segment-anything-2/">
        <img src="https://raw.githubusercontent.com/roboflow-ai/notebooks/main/assets/badges/roboflow-blogpost.svg" alt="Roboflow" style="display:inline-block;">
    </a>
    <a href="https://www.youtube.com/watch?v=Dv003fTyO-Y">
        <img src="https://badges.aleen42.com/src/youtube.svg" alt="YouTube" style="display:inline-block;">
    </a>
</div>

Segment Anything Model 2 (SAM 2) is a foundation model designed to address promptable
visual segmentation in both images and videos. **Video segmentation will be available
soon.**
"""

EXAMPLES = [
    ["tiny", MASK_GENERATION_MODE, "https://media.roboflow.com/notebooks/examples/dog-2.jpeg", None],
    ["tiny", MASK_GENERATION_MODE, "https://media.roboflow.com/notebooks/examples/dog-3.jpeg", None],
    ["tiny", MASK_GENERATION_MODE, "https://media.roboflow.com/notebooks/examples/dog-4.jpeg", None],
]
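
# Each example row follows the `inputs` order of the gr.Examples block below:
# (checkpoint name, mode, value for the plain image input, value for the
# image prompter input).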

DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
MASK_ANNOTATOR = sv.MaskAnnotator(color_lookup=sv.ColorLookup.INDEX)
IMAGE_PREDICTORS, MASK_GENERATORS = load_models(device=DEVICE)
SCALE_FACTOR = 0.5
TARGET_DIRECTORY = "tmp"

# creating video results directory
create_directory(directory_path=TARGET_DIRECTORY)
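
# IMAGE_PREDICTORS and MASK_GENERATORS are indexed by checkpoint name in
# process_image() below, so load_models() presumably returns two dicts keyed by
# CHECKPOINT_NAMES, mapping to SAM 2 image predictors and automatic mask
# generators respectively (an assumption; utils/models.py is not shown here).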

def on_mode_dropdown_change(text):
    # toggle component visibility so that only the inputs, buttons, and outputs
    # relevant to the selected mode are shown
    return [
        gr.Image(visible=text == MASK_GENERATION_MODE),
        ImagePrompter(visible=text == BOX_PROMPT_MODE),
        gr.Video(visible=text == VIDEO_SEGMENTATION_MODE),
        ImagePrompter(visible=text == VIDEO_SEGMENTATION_MODE),
        gr.Button(visible=text != VIDEO_SEGMENTATION_MODE),
        gr.Button(visible=text == VIDEO_SEGMENTATION_MODE),
        gr.Image(visible=text != VIDEO_SEGMENTATION_MODE),
        gr.Video(visible=text == VIDEO_SEGMENTATION_MODE)
    ]

def on_video_input_change(video_input):
    # grab the first frame of the uploaded video and pre-populate the frame
    # prompter so the user can draw prompts on it
    if not video_input:
        return None
    frames_generator = sv.get_video_frames_generator(video_input)
    frame = next(frames_generator)
    frame = sv.scale_image(frame, SCALE_FACTOR)
    frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
    frame = Image.fromarray(frame)
    return {'image': frame, 'points': []}
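
# Note on the ImagePrompter value format assumed throughout this file: the
# component returns a dict with an "image" and a list of "points", where each
# entry is a 6-tuple (x1, y1, flag, x2, y2, flag) and the flags encode the
# prompt type. process_image() keeps both box corners, while process_video()
# keeps only the first coordinate pair of each entry.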

def process_image(
    checkpoint_dropdown,
    mode_dropdown,
    image_input,
    image_prompter_input
) -> Optional[Image.Image]:
    if mode_dropdown == BOX_PROMPT_MODE:
        image_input = image_prompter_input["image"]
        prompt = image_prompter_input["points"]
        if len(prompt) == 0:
            return image_input

        model = IMAGE_PREDICTORS[checkpoint_dropdown]
        image = np.array(image_input.convert("RGB"))
        box = np.array([[x1, y1, x2, y2] for x1, y1, _, x2, y2, _ in prompt])

        model.set_image(image)
        masks, _, _ = model.predict(box=box, multimask_output=False)

        # dirty fix; remove this later: with multiple boxes, predict() returns
        # masks shaped (N, 1, H, W), so squeeze the extra channel dimension
        if len(masks.shape) == 4:
            masks = np.squeeze(masks)

        detections = sv.Detections(
            xyxy=sv.mask_to_xyxy(masks=masks),
            mask=masks.astype(bool)
        )
        return MASK_ANNOTATOR.annotate(image_input, detections)

    if mode_dropdown == MASK_GENERATION_MODE:
        model = MASK_GENERATORS[checkpoint_dropdown]
        image = np.array(image_input.convert("RGB"))
        result = model.generate(image)
        detections = sv.Detections.from_sam(result)
        return MASK_ANNOTATOR.annotate(image_input, detections)
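
# For reference, a minimal sketch of driving one of the image predictors
# directly, outside Gradio (illustrative only; the file path and box
# coordinates are placeholders, and "tiny" is assumed to be one of
# CHECKPOINT_NAMES, as in EXAMPLES above):
#
#   predictor = IMAGE_PREDICTORS["tiny"]
#   image = np.array(Image.open("example.jpeg").convert("RGB"))
#   predictor.set_image(image)
#   masks, scores, _ = predictor.predict(
#       box=np.array([[10, 20, 300, 400]]),
#       multimask_output=False,
#   )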

def process_video(
    checkpoint_dropdown,
    mode_dropdown,
    video_input,
    video_prompter_input,
    progress=gr.Progress(track_tqdm=True)
) -> str:
    if mode_dropdown != VIDEO_SEGMENTATION_MODE:
        return str(video_input)

    # split the uploaded video into downscaled JPEG frames, as expected by the
    # SAM 2 video predictor
    name = generate_unique_name()
    frame_directory_path = os.path.join(TARGET_DIRECTORY, name)
    frames_sink = sv.ImageSink(
        target_dir_path=frame_directory_path,
        image_name_pattern="{:05d}.jpeg"
    )
    video_info = sv.VideoInfo.from_video_path(video_input)
    frames_generator = sv.get_video_frames_generator(video_input)
    with frames_sink:
        for frame in tqdm(
            frames_generator,
            total=video_info.total_frames,
            desc="splitting video into frames"
        ):
            frame = sv.scale_image(frame, SCALE_FACTOR)
            frames_sink.save_image(frame)

    model = build_sam2_video_predictor(
        "sam2_hiera_t.yaml",
        "checkpoints/sam2_hiera_tiny.pt",
        device=DEVICE
    )
    inference_state = model.init_state(
        video_path=frame_directory_path,
        offload_video_to_cpu=DEVICE == torch.device('cpu'),
        offload_state_to_cpu=DEVICE == torch.device('cpu'),
    )

    # use the prompt coordinates from the first frame as positive point prompts
    # for a single tracked object
    prompt = video_prompter_input["points"]
    points = np.array([[x1, y1] for x1, y1, _, _, _, _ in prompt])
    labels = np.ones(len(points))
    _, object_ids, mask_logits = model.add_new_points(
        inference_state=inference_state,
        frame_idx=0,
        obj_id=1,
        points=points,
        labels=labels,
    )

    del inference_state
    del model

    # mask propagation and video rendering are not implemented yet; the input
    # video is returned unchanged for now (see the note in MARKDOWN)
    video_path = os.path.join(TARGET_DIRECTORY, f"{name}.mp4")
    return str(video_input)
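
# The propagation step above is intentionally missing (see the "available soon"
# note in MARKDOWN): prompts are registered with the predictor, but the state is
# then discarded and the input video is returned unchanged. A rough sketch of
# how the loop could be completed with SAM 2's `propagate_in_video` API, writing
# annotated frames to `video_path` (illustrative only; it would need to run
# before the `del` statements, and the VideoInfo would need to reflect the
# SCALE_FACTOR-scaled frame resolution):
#
#   with sv.VideoSink(target_path=video_path, video_info=video_info) as sink:
#       for frame_idx, object_ids, mask_logits in model.propagate_in_video(inference_state):
#           frame = cv2.imread(
#               os.path.join(frame_directory_path, f"{frame_idx:05d}.jpeg"))
#           masks = (mask_logits > 0.0).cpu().numpy()[:, 0, :, :]
#           detections = sv.Detections(
#               xyxy=sv.mask_to_xyxy(masks=masks), mask=masks)
#           sink.write_frame(MASK_ANNOTATOR.annotate(frame, detections))
#   return video_path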

with gr.Blocks() as demo:
    gr.Markdown(MARKDOWN)
    with gr.Row():
        checkpoint_dropdown_component = gr.Dropdown(
            choices=CHECKPOINT_NAMES,
            value=CHECKPOINT_NAMES[0],
            label="Checkpoint",
            info="Select a SAM 2 checkpoint to use.",
            interactive=True
        )
        mode_dropdown_component = gr.Dropdown(
            choices=MODE_NAMES,
            value=MODE_NAMES[0],
            label="Mode",
            info="Select a mode to use: `box prompt` to generate masks for "
                 "selected objects, `mask generation` to generate masks for "
                 "the whole image, and `video segmentation` to track objects "
                 "in a video.",
            interactive=True
        )
    with gr.Row():
        with gr.Column():
            image_input_component = gr.Image(
                type='pil', label='Upload image', visible=False)
            image_prompter_input_component = ImagePrompter(
                type='pil', label='Prompt image')
            video_input_component = gr.Video(
                label='Step 1: Upload video', visible=False)
            video_prompter_input_component = ImagePrompter(
                type='pil', label='Step 2: Prompt frame', visible=False)
            submit_image_button_component = gr.Button(
                value='Submit', variant='primary')
            submit_video_button_component = gr.Button(
                value='Submit', variant='primary', visible=False)
        with gr.Column():
            image_output_component = gr.Image(type='pil', label='Image output')
            video_output_component = gr.Video(
                label='Step 2: Video output', visible=False)
    with gr.Row():
        gr.Examples(
            fn=process_image,
            examples=EXAMPLES,
            inputs=[
                checkpoint_dropdown_component,
                mode_dropdown_component,
                image_input_component,
                image_prompter_input_component,
            ],
            outputs=[image_output_component],
            run_on_click=True
        )
    mode_dropdown_component.change(
        on_mode_dropdown_change,
        inputs=[mode_dropdown_component],
        outputs=[
            image_input_component,
            image_prompter_input_component,
            video_input_component,
            video_prompter_input_component,
            submit_image_button_component,
            submit_video_button_component,
            image_output_component,
            video_output_component
        ]
    )
    video_input_component.change(
        fn=on_video_input_change,
        inputs=[video_input_component],
        outputs=[video_prompter_input_component]
    )
    submit_image_button_component.click(
        fn=process_image,
        inputs=[
            checkpoint_dropdown_component,
            mode_dropdown_component,
            image_input_component,
            image_prompter_input_component,
        ],
        outputs=[image_output_component]
    )
    submit_video_button_component.click(
        fn=process_video,
        inputs=[
            checkpoint_dropdown_component,
            mode_dropdown_component,
            video_input_component,
            video_prompter_input_component,
        ],
        outputs=[video_output_component]
    )

demo.launch(debug=False, show_error=True, max_threads=1)