Update app.py

app.py CHANGED
@@ -7,7 +7,7 @@ Swin/CAFormer/DINOv2 AI detection
 • CAFormer-V10 : 4-class (photo / anime × AI / Non-AI)
 • DINOv2-4class : 4-class (photo / anime × AI / Non-AI)
 • DINOv2-MeanPool-Contrastive : 4-class (photo / anime × AI / Non-AI)
-•
+• V1-Emb : 2-class (AI vs. Non-AI)
 -------------------------------------------------------------------
 Author: telecomadm1145
 """
@@ -31,7 +31,7 @@ HF_FILENAMES = {
     "V2-Swin": "swin_classifier_stage1_v2_epoch_3.pth",
     "V4-Swin": "swin_classifier_stage1_v4.pth",
     "V9-Swin": "swin_classifier_4class_fp16_v9_acc9861.pth",
-    "
+    "V1-Emb": "swinv2_v3_v1.pth"
 }
 CKPT_META = {
     "V2": { "n_cls": 2, "head": "v4", "backbone": "swin_large_patch4_window12_384",
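Note: a minimal sketch of how an entry in HF_FILENAMES is typically resolved to a local checkpoint path. The repo_id below is a placeholder, since the checkpoint repo is defined elsewhere in app.py and does not appear in this diff.

    from huggingface_hub import hf_hub_download

    ckpt_path = hf_hub_download(
        repo_id="<checkpoint-repo>",   # hypothetical; not shown in this diff
        filename="swinv2_v3_v1.pth",   # HF_FILENAMES["V1-Emb"]
    )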
@@ -43,10 +43,10 @@ CKPT_META = {
     "V2.5-CAFormer": { "n_cls": 4, "head": "v7", "backbone": "caformer_b36.sail_in22k_ft_in1k_384",
                        "names": ["non_ai", "ai", "ani_non_ai", "ani_ai"]},
     # --- MODIFIED: Added specific keys for the new loading logic ---
-    "
+    "V1-Emb": {
         "n_cls": 2,
         "head": "embedding_classifier",
-        "timm_model_name": "
+        "timm_model_name": "hf_hub:SmilingWolf/wd-swinv2-tagger-v3",
         "backbone_repo_id": "SmilingWolf/wd-swinv2-tagger-v3",
         "backbone_filename": "model.safetensors",
         "names": ["Non-AI Generated", "AI Generated"]
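Note: how load_model() consumes these new keys is outside this hunk. A minimal sketch, assuming timm's hf_hub: prefix and a manual safetensors load are two interchangeable routes to the same backbone:

    import timm
    from huggingface_hub import hf_hub_download
    from safetensors.torch import load_file

    meta = CKPT_META["V1-Emb"]

    # Option A: let timm resolve the architecture from the Hub repo
    backbone = timm.create_model(meta["timm_model_name"], pretrained=False, num_classes=0)

    # Option B: fetch the raw weights and load them into the backbone
    weights = load_file(hf_hub_download(repo_id=meta["backbone_repo_id"],
                                        filename=meta["backbone_filename"]))
    backbone.load_state_dict(weights, strict=False)  # tagger head keys may differ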
@@ -322,6 +322,34 @@ def build_transform(is_training: bool, interpolation: str):
     cfg.update(dict(interpolation=interpolation))
     return timm.data.create_transform(**cfg, is_training=is_training)
 
+# ######################################################################
+# START: Preprocessing functions for V1-Emb model, copied from 2nd script
+# ######################################################################
+def pil_ensure_rgb(image: Image.Image) -> Image.Image:
+    # convert to RGB/RGBA if not already (deals with palette images etc.)
+    if image.mode not in ["RGB", "RGBA"]:
+        image = image.convert("RGBA") if "transparency" in image.info else image.convert("RGB")
+    # convert RGBA to RGB with white background
+    if image.mode == "RGBA":
+        canvas = Image.new("RGBA", image.size, (255, 255, 255))
+        canvas.alpha_composite(image)
+        image = canvas.convert("RGB")
+    return image
+
+
+def pil_pad_square(image: Image.Image) -> Image.Image:
+    w, h = image.size
+    # get the largest dimension so we can pad to a square
+    px = max(image.size)
+    # pad to square with white background
+    canvas = Image.new("RGB", (px, px), (255, 255, 255))
+    canvas.paste(image, ((px - w) // 2, (px - h) // 2))
+    return canvas
+# ####################################################################
+# END: Preprocessing functions for V1-Emb model
+# ####################################################################
+
+
 # --------------------------------------------------
 # 6. Inference
 # --------------------------------------------------
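Note: a quick usage sketch for the two helpers above; "example.png" is a hypothetical input, and any palette or RGBA image exercises both code paths:

    from PIL import Image

    img = Image.open("example.png")   # hypothetical file
    img = pil_ensure_rgb(img)         # palette/RGBA -> RGB on a white background
    img = pil_pad_square(img)         # e.g. 300x200 -> 300x300, content centered
    assert img.mode == "RGB" and img.size[0] == img.size[1]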
@@ -331,19 +359,39 @@ def predict(image: Image.Image,
             interpolation: str = "bicubic"):
     if image is None: return None
     load_model(ckpt_name)
-
-    if "dinov2" in current_meta.get("model_type", ""):
+
+    # ####################################################################
+    # START: MODIFIED preprocessing logic
+    # ####################################################################
+    if ckpt_name == "V1-Emb":
+        # Specific preprocessing for the V1-Emb model based on the tagger script
+        # 1. Ensure RGB and pad to a square to prevent distortion
+        processed_image = pil_ensure_rgb(image)
+        processed_image = pil_pad_square(processed_image)
+
+        # 2. Apply standard timm transforms (resize, tensor, normalize)
+        tfm = build_transform(False, interpolation)
+        inp = tfm(processed_image).unsqueeze(0).to(device)
+
+        # 3. Convert from RGB to BGR as required by the original model
+        inp = inp[:, [2, 1, 0]]
+
+    elif "dinov2" in current_meta.get("model_type", ""):
         # DINOv2 specific transform
         tfm = transforms.Compose([
             transforms.Resize((224, 224)),
             transforms.ToTensor(),
             transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
         ])
+        inp = tfm(image).unsqueeze(0).to(device)
     else:
-        # Original transform logic for Swin/CAFormer
+        # Original transform logic for Swin/CAFormer
         tfm = build_transform(False, interpolation)
+        inp = tfm(image).unsqueeze(0).to(device)
+    # ####################################################################
+    # END: MODIFIED preprocessing logic
+    # ####################################################################
 
-    inp = tfm(image).unsqueeze(0).to(device)
     # MODIFIED: For EmbeddingClassifier, the output is already probabilities, no need for softmax.
     # For others, softmax is needed.
     if current_meta["head"] == "embedding_classifier":