fix: single input for detection and handle class names

Signed-off-by: Onuralp SEZER <thunderbirdtr@gmail.com>
app.py CHANGED
@@ -54,6 +54,12 @@ model_id = "google/paligemma2-3b-pt-448"
 model = PaliGemmaForConditionalGeneration.from_pretrained(model_id).eval().to(DEVICE)
 processor = PaliGemmaProcessor.from_pretrained(model_id)
 
+def parse_class_names(prompt):
+    if not prompt.lower().startswith('detect '):
+        return []
+    classes_text = prompt[7:].strip()
+    return [cls.strip() for cls in classes_text.split(';') if cls.strip()]
+
 @spaces.GPU
 def paligemma_detection(input_image, input_text, max_new_tokens):
     model_inputs = processor(text=input_text,
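The helper recovers the class list from a single free-form prompt. A quick sketch of its behavior (sample prompts are hypothetical):

    parse_class_names("detect person;dog;building")  # -> ['person', 'dog', 'building']
    parse_class_names("Detect car ; truck ")         # -> ['car', 'truck']  (case-insensitive, whitespace trimmed)
    parse_class_names("segment person")              # -> []  (rejected: prompt must start with 'detect ')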
@@ -70,13 +76,17 @@ def paligemma_detection(input_image, input_text, max_new_tokens):
 
 
 
-def annotate_image(result, resolution_wh, class_names, cv_image):
+def annotate_image(result, resolution_wh, prompt, cv_image):
+    class_names = parse_class_names(prompt)
+    if not class_names:
+        gr.Warning("Invalid prompt format. Please use 'detect class1;class2;class3' format")
+        return cv_image
 
     detections = sv.Detections.from_lmm(
-        sv.LMM.PALIGEMMA,
-        result,
-        resolution_wh=resolution_wh,
-        classes=class_names
+        sv.LMM.PALIGEMMA,
+        result,
+        resolution_wh=resolution_wh,
+        classes=class_names
     )
 
     annotated_image = BOX_ANNOTATOR.annotate(
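For context, a minimal standalone sketch of the supervision calls this hunk relies on; the result string, image, and resolution below are made up for illustration:

    import numpy as np
    import supervision as sv

    # Hypothetical raw PaliGemma output: loc tokens (y1 x1 y2 x2, normalized to 1024) plus class name.
    result = "<loc0100><loc0200><loc0900><loc0800> person"

    detections = sv.Detections.from_lmm(
        sv.LMM.PALIGEMMA,
        result,
        resolution_wh=(640, 480),     # (width, height) of the image being annotated
        classes=["person", "dog"],    # the list produced by parse_class_names
    )

    scene = np.zeros((480, 640, 3), dtype=np.uint8)  # stand-in for cv_image
    annotated = sv.BoxAnnotator().annotate(scene=scene, detections=detections)

Detections whose class name is not in `classes` are dropped by from_lmm, which is why an empty class list is rejected up front.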
@@ -98,17 +108,17 @@ def annotate_image(result, resolution_wh, class_names, cv_image):
     return annotated_image
 
 
-def process_image(input_image, input_text, class_names, max_new_tokens):
+def process_image(input_image, input_text, max_new_tokens):
     cv_image = cv2.cvtColor(np.array(input_image), cv2.COLOR_RGB2BGR)
     result = paligemma_detection(input_image, input_text, max_new_tokens)
     annotated_image = annotate_image(result,
                                      (input_image.width, input_image.height),
-                                     class_names, cv_image)
+                                     input_text, cv_image)
     return annotated_image, result
 
 
 @spaces.GPU
-def process_video(input_video, input_text, class_names, max_new_tokens, progress=gr.Progress(track_tqdm=True)):
+def process_video(input_video, input_text, max_new_tokens, progress=gr.Progress(track_tqdm=True)):
     if not input_video:
         gr.Info("Please upload a video.")
         return None
@@ -117,6 +127,11 @@ def process_video(input_video, input_text, class_names, max_new_tokens, progress
         gr.Info("Please enter a text prompt.")
         return None
 
+    class_names = parse_class_names(input_text)
+    if not class_names:
+        gr.Warning("Invalid prompt format. Please use 'detect class1;class2;class3' format")
+        return None, None
+
     name = generate_unique_name()
     frame_directory_path = os.path.join(VIDEO_TARGET_DIRECTORY, name)
     create_directory(frame_directory_path)
@@ -146,7 +161,7 @@ def process_video(input_video, input_text, class_names, max_new_tokens, progress
             sv.LMM.PALIGEMMA,
             result,
             resolution_wh=(video_info.width, video_info.height),
-            classes=class_names
+            classes=class_names
         )
 
         annotated_frame = BOX_ANNOTATOR.annotate(
@@ -177,15 +192,18 @@ with gr.Blocks() as app:
     with gr.Row():
         with gr.Column():
             input_image = gr.Image(type="pil", label="Input Image")
-            input_text = gr.Textbox(label="Enter detection prompt")
-            class_names = gr.Textbox(label="Enter class names")
+            input_text = gr.Textbox(
+                lines=2,
+                placeholder="Enter prompt in format like this: detect person;dog;building",
+                label="Enter detection prompt"
+            )
             max_new_tokens = gr.Slider(minimum=20, maximum=200, value=100, step=10, label="Max New Tokens", info="Set to larger for longer generation.")
         with gr.Column():
             annotated_image = gr.Image(type="pil", label="Annotated Image")
             detection_result = gr.Textbox(label="Detection Result")
     gr.Button("Submit").click(
         fn=process_image,
-        inputs=[input_image, input_text, class_names, max_new_tokens],
+        inputs=[input_image, input_text, max_new_tokens],
         outputs=[annotated_image, detection_result]
     )
 
@@ -193,8 +211,11 @@ with gr.Blocks() as app:
     with gr.Row():
         with gr.Column():
             input_video = gr.Video(label="Input Video")
-            input_text = gr.Textbox(label="Enter detection prompt")
-            class_names = gr.Textbox(label="Enter class names")
+            input_text = gr.Textbox(
+                lines=2,
+                placeholder="Enter prompt in format like this: detect person;dog;building",
+                label="Enter detection prompt"
+            )
             max_new_tokens = gr.Slider(minimum=20, maximum=200, value=100, step=1, label="Max New Tokens", info="Set to larger for longer generation.")
         with gr.Column():
             output_video = gr.Video(label="Annotated Video")
@@ -202,7 +223,7 @@ with gr.Blocks() as app:
 
     gr.Button("Process Video").click(
         fn=process_video,
-        inputs=[input_video, input_text, class_names, max_new_tokens],
+        inputs=[input_video, input_text, max_new_tokens],
         outputs=[output_video, detection_result]
     )
 
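With the callbacks wired to the single textbox, a local smoke test of the image path could look like this (a hypothetical snippet; assumes the Space's app module is importable and a test image exists on disk):

    from PIL import Image

    image = Image.open("street.jpg")        # hypothetical test image
    prompt = "detect person;car;bicycle"    # one input carries both the task and the classes

    annotated, raw = process_image(image, prompt, max_new_tokens=100)
    print(raw)  # PaliGemma's raw location-token string
    # 'annotated' is the frame with boxes drawn only for the parsed classes.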