Spaces:

telecomadm1145
/

AIDetectV2

Running

App Files Community

telecomadm1145 committed 4 days ago

Commit

94778bf

verified ·

1 Parent(s): 8cbab40

Update app.py

Browse files

Files changed (1) hide show

app.py +17 -183

app.py CHANGED Viewed

@@ -1,16 +1,11 @@
 # -*- coding: utf-8 -*-
 """
-Swin/CAFormer/DINOv2 AI detection
 -------------------------------------------------------------------
 • Swin-V2 / V4                          : 2-class  (AI vs. Non-AI)
 • Swin-V7 / V8 / V9                     : 4-class  (photo / anime × AI / Non-AI)
-• CAFormer-V10                          : 4-class  (photo / anime × AI / Non-AI)
-• DINOv2-4class                         : 4-class  (photo / anime × AI / Non-AI)
-• DINOv2-MeanPool-Contrastive           : 4-class  (photo / anime × AI / Non-AI)
-• V1-Emb                                : 2-class  (AI vs. Non-AI)
-• V2-Emb                                : 2-class  (AI vs. Non-AI)
 -------------------------------------------------------------------
-Author: telecomadm1145
 """
 import os, torch, timm, numpy as np
 import torch.nn as nn
@@ -18,14 +13,11 @@ import torch.nn.functional as F
 from PIL import Image
 import gradio as gr
 from huggingface_hub import hf_hub_download
-from safetensors.torch import load_file  # Added for .safetensors support
-# Added for DINOv2 model
 from transformers import AutoModel
 from torchvision import transforms
-# --------------------------------------------------
-# 1. Model & Checkpoint Meta-data
-# --------------------------------------------------
 REPO_ID = "telecomadm1145/swin-ai-detection"
 HF_FILENAMES = {
     "V2.5-CAFormer":      "caformer_b36_4class_96.safetensors",
@@ -56,7 +48,7 @@ DEFAULT_CKPT = "V3-Emb"
 LOCAL_CKPT_DIR = "./checkpoints"
 SEED = 4421
 DROP_RATE = 0.1
-DROPOUT_RATE = 0.1 # From train.py for DINOv2
 device = "cuda" if torch.cuda.is_available() else "cpu"
 torch.manual_seed(SEED);  np.random.seed(SEED)
 print(f"Using device: {device}")
@@ -81,7 +73,6 @@ class EmbeddingClassifier(nn.Module):
     def forward(self, x):
         return self.net(x)
-# MODIFIED: Changed __init__ to accept timm_model_name and use pretrained=False
 class EmbeddingClassifierModel(nn.Module):
     def __init__(self, timm_model_name, num_classes):
         super().__init__()
@@ -91,96 +82,10 @@ class EmbeddingClassifierModel(nn.Module):
     def forward(self, x):
         features = self.backbone(x)
-        # The classifier returns a single value (probability of being Non-AI)
         prob_class0 = self.classifier(features)
-        # To maintain compatibility with the `predict` function which expects multi-class outputs
-        # and applies softmax, we construct a 2-class output.
-        # prob_class1 is simply 1 - prob_class0
         prob_class1 = 1 - prob_class0
-        # The final output is for ["Non-AI", "AI"], i.e., [prob_class0, prob_class1].
-        # The softmax in predict() will be applied to this, so we should return logits.
-        # However, since the original output is a sigmoid, we can work with probabilities
-        # and just return them directly. The gr.Label will normalize this.
-        # A simpler way is to construct logits that would result in these probabilities.
-        # Let's stick to the original logic's output format.
         return torch.cat([prob_class0, prob_class1], dim=1)
-# --- Original DINOv2 Classifier (Weighted Attention Pooling) ---
-class DINOv2Classifier_WeightedPool(nn.Module):
-    def __init__(self, model_name, num_classes):
-        super().__init__()
-        self.backbone = AutoModel.from_pretrained(model_name)
-        self.weight_self_attention = nn.MultiheadAttention(
-            embed_dim=self.backbone.config.hidden_size,
-            num_heads=self.backbone.config.num_attention_heads,
-            dropout=self.backbone.config.hidden_dropout_prob,
-            batch_first=True
-        )
-        self.weight_mlp = nn.Sequential(
-            nn.Linear(self.backbone.config.hidden_size, self.backbone.config.hidden_size * 4),
-            nn.LayerNorm(self.backbone.config.hidden_size * 4),
-            nn.GELU(),
-            nn.Linear(self.backbone.config.hidden_size * 4, 1)
-        )
-        self.classifier = nn.Sequential(
-            nn.Dropout(DROPOUT_RATE),
-            nn.Linear(self.backbone.config.hidden_size, self.backbone.config.hidden_size),
-            nn.LayerNorm(self.backbone.config.hidden_size),
-            nn.GELU(),
-            nn.Dropout(DROPOUT_RATE),
-            nn.Linear(self.backbone.config.hidden_size, num_classes)
-        )
-        nn.init.xavier_uniform_(self.weight_self_attention.in_proj_weight)
-        nn.init.xavier_uniform_(self.weight_self_attention.out_proj.weight)
-        nn.init.constant_(self.weight_self_attention.out_proj.bias, 0)
-        for module in [self.weight_mlp, self.classifier]:
-            if isinstance(module, nn.Linear):
-                nn.init.xavier_uniform_(module.weight)
-                nn.init.constant_(module.bias, 0)
-    def forward(self, x):
-        outputs = self.backbone(x)
-        attn_output, _ = self.weight_self_attention(
-            outputs.last_hidden_state,
-            outputs.last_hidden_state,
-            outputs.last_hidden_state,
-        )
-        raw_weights = self.weight_mlp(attn_output)
-        raw_weights = raw_weights.squeeze(-1)
-        pooling_weights = torch.softmax(raw_weights, dim=-1)
-        pooled_output = torch.sum(outputs.last_hidden_state * pooling_weights.unsqueeze(-1), dim=1)
-        return self.classifier(pooled_output)
-# --- New DINOv2 Classifier (Mean Pooling) ---
-class DINOv2Classifier_MeanPool(nn.Module):
-    def __init__(self, model_name, num_classes):
-        super().__init__()
-        self.backbone = AutoModel.from_pretrained(model_name)
-        self.classifier = nn.Sequential(
-            nn.Dropout(DROPOUT_RATE),
-            nn.Linear(self.backbone.config.hidden_size, self.backbone.config.hidden_size),
-            nn.LayerNorm(self.backbone.config.hidden_size),
-            nn.GELU(),
-            nn.Dropout(DROPOUT_RATE),
-            nn.Linear(self.backbone.config.hidden_size, num_classes)
-        )
-        for module in self.classifier:
-            if isinstance(module, nn.Linear):
-                nn.init.xavier_uniform_(module.weight)
-                nn.init.constant_(module.bias, 0)
-    def forward(self, x, return_features=False):
-        outputs = self.backbone(x)
-        pooled_output = outputs.last_hidden_state.mean(dim=1)
-        if return_features:
-            return pooled_output
-        return self.classifier(pooled_output)
-# --- SwinClassifier ---
 class SwinClassifier(nn.Module):
     def __init__(self, model_name, num_classes, pretrained=True,
                  head_version="v4"):
@@ -189,7 +94,6 @@ class SwinClassifier(nn.Module):
             model_name, pretrained=pretrained, num_classes=0
         )
         self.data_config = timm.data.resolve_data_config({}, model=self.backbone)
-        # ------- 根据版本选择不同 head -------
         if head_version == "v7":            # <-- V7, V8, V9, V10: 极简 64-hidden, GELU
             self.classifier = nn.Sequential(
                 nn.Dropout(DROP_RATE),
@@ -228,11 +132,7 @@ class SwinClassifier(nn.Module):
     def forward(self, x):
         return self.classifier(self.backbone(x))
-# --------------------------------------------------
-# 4. 动态加载模型
-# --------------------------------------------------
 def load_model(ckpt_name: str):
-    """Load model only when `ckpt_name` changes."""
     global model, current_ckpt, current_meta
     if ckpt_name == current_ckpt and model is not None:
         return
@@ -240,17 +140,14 @@ def load_model(ckpt_name: str):
     meta = CKPT_META[ckpt_name]
     ckpt_filename = HF_FILENAMES[ckpt_name]
-    # --- MODIFIED: Special handling for EmbeddingClassifier ---
     head_version = meta.get("head", "v4")
     if head_version == "embedding_classifier":
-        # 1. Create the model structure with a non-pretrained backbone
         print(f"Creating backbone: {meta['timm_model_name']}")
         model = EmbeddingClassifierModel(
             timm_model_name=meta["timm_model_name"],
             num_classes=meta["n_cls"]
         ).to(device)
-        # 2. Download and load backbone weights from SmilingWolf's repo
         print(f"Loading backbone weights from {meta['backbone_repo_id']}...")
         backbone_ckpt_file = hf_hub_download(
             repo_id=meta["backbone_repo_id"],
@@ -261,11 +158,10 @@ def load_model(ckpt_name: str):
         model.backbone.load_state_dict(backbone_state,strict=False)
         print("✅ Backbone weights loaded.")
-        # 3. Download and load classifier (head) weights from the main repo
         print(f"Loading classifier head weights from {REPO_ID}...")
         classifier_ckpt_file = hf_hub_download(
             repo_id=REPO_ID,
-            filename=ckpt_filename, # This is 'swinv2_v3_v1.pth'
             local_dir=LOCAL_CKPT_DIR, force_download=False
         )
         classifier_state = torch.load(classifier_ckpt_file, map_location=device, weights_only=False)
@@ -273,7 +169,6 @@ def load_model(ckpt_name: str):
         print("✅ Classifier head weights loaded.")
     else:
-        # --- Original logic for all other models ---
         ckpt_file = hf_hub_download(
             repo_id=REPO_ID,
             filename=ckpt_filename,
@@ -281,27 +176,13 @@ def load_model(ckpt_name: str):
         )
         print(f"Checkpoint: {ckpt_file}")
-        # Build model structure based on model_type or head
-        model_type = meta.get("model_type")
-        if model_type == "dinov2_weighted_pool":
-            model = DINOv2Classifier_WeightedPool(
-                model_name=meta["backbone"],
-                num_classes=meta["n_cls"]
-            ).to(device)
-        elif model_type == "dinov2_mean_pool":
-            model = DINOv2Classifier_MeanPool(
-                model_name=meta["backbone"],
-                num_classes=meta["n_cls"]
-            ).to(device)
-        else: # Existing logic for Swin/CAFormer
-            model = SwinClassifier(
-                meta["backbone"],
-                num_classes=meta["n_cls"],
-                pretrained=False,
-                head_version=head_version
-            ).to(device)
-        # Compatible load for .pth and .safetensors
         if ckpt_filename.endswith(".safetensors"):
             state = load_file(ckpt_file, device=device)
         else:
@@ -313,23 +194,15 @@ def load_model(ckpt_name: str):
     current_ckpt, current_meta = ckpt_name, meta
     print(f"✅ {ckpt_name} loaded (classes = {meta['n_cls']}).")
-# --------------------------------------------------
-# 5. Transform 工厂
-# --------------------------------------------------
 def build_transform(is_training: bool, interpolation: str):
     if model is None: raise RuntimeError("Model not loaded yet.")
     cfg = model.data_config.copy()
     cfg.update(dict(interpolation=interpolation))
     return timm.data.create_transform(**cfg, is_training=is_training)
-# ######################################################################
-# START: Preprocessing functions for V1-Emb model, copied from 2nd script
-# ######################################################################
 def pil_ensure_rgb(image: Image.Image) -> Image.Image:
-    # convert to RGB/RGBA if not already (deals with palette images etc.)
     if image.mode not in ["RGB", "RGBA"]:
         image = image.convert("RGBA") if "transparency" in image.info else image.convert("RGB")
-    # convert RGBA to RGB with white background
     if image.mode == "RGBA":
         canvas = Image.new("RGBA", image.size, (255, 255, 255))
         canvas.alpha_composite(image)
@@ -339,20 +212,11 @@ def pil_ensure_rgb(image: Image.Image) -> Image.Image:
 def pil_pad_square(image: Image.Image) -> Image.Image:
     w, h = image.size
-    # get the largest dimension so we can pad to a square
     px = max(image.size)
-    # pad to square with white background
     canvas = Image.new("RGB", (px, px), (255, 255, 255))
     canvas.paste(image, ((px - w) // 2, (px - h) // 2))
     return canvas
-# ####################################################################
-# END: Preprocessing functions for V1-Emb model
-# ####################################################################
-# --------------------------------------------------
-# 6. Inference
-# --------------------------------------------------
 @torch.no_grad()
 def predict(image: Image.Image,
             ckpt_name: str,
@@ -360,60 +224,34 @@ def predict(image: Image.Image,
     if image is None: return None
     load_model(ckpt_name)
-    # ####################################################################
-    # START: MODIFIED preprocessing logic
-    # ####################################################################
     if "Emb" in ckpt_name:
-        # Specific preprocessing for the V1-Emb model based on the tagger script
-        # 1. Ensure RGB and pad to a square to prevent distortion
         processed_image = pil_ensure_rgb(image)
         processed_image = pil_pad_square(processed_image)
-        # 2. Apply standard timm transforms (resize, tensor, normalize)
         tfm = build_transform(False, interpolation)
         inp = tfm(processed_image).unsqueeze(0).to(device)
-        # 3. Convert from RGB to BGR as required by the original model
         inp = inp[:, [2, 1, 0]]
-    elif "dinov2" in current_meta.get("model_type", ""):
-        # DINOv2 specific transform
-        tfm = transforms.Compose([
-            transforms.Resize((224, 224)),
-            transforms.ToTensor(),
-            transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
-        ])
-        inp = tfm(image).unsqueeze(0).to(device)
     else:
-        # Original transform logic for Swin/CAFormer
         tfm = build_transform(False, interpolation)
         inp = tfm(image).unsqueeze(0).to(device)
-    # ####################################################################
-    # END: MODIFIED preprocessing logic
-    # ####################################################################
-    # MODIFIED: For EmbeddingClassifier, the output is already probabilities, no need for softmax.
-    # For others, softmax is needed.
     if current_meta["head"] == "embedding_classifier":
         probs = model(inp)[0].cpu()
     else:
         probs = F.softmax(model(inp), dim=1)[0].cpu()
     class_names = current_meta["names"]
-    # 保证 gr.Label 在 2 / 4 类都能正常显示
     return {class_names[i]: float(probs[i])
             for i in range(len(class_names))}
-# --------------------------------------------------
-# 7. Gradio UI
-# --------------------------------------------------
 def launch():
-    load_model(DEFAULT_CKPT)      # 预加载
     with gr.Blocks(theme=gr.themes.Soft()) as demo:
         gr.Markdown("# AI Detector")
         gr.Markdown(
             "Choose a model checkpoint on the left, upload an image, "
-            "and click **Run** to see predictions. V2-Emb produces the best results."
         )
         with gr.Row():
             with gr.Column(scale=1):
@@ -429,17 +267,14 @@ def launch():
                 in_img  = gr.Image(type="pil", label="Upload Image")
             with gr.Column(scale=1):
-                # num_top_classes 设为 4，兼容 2-class / 4-class
                 out_lbl = gr.Label(num_top_classes=4, label="Predictions")
         run_btn.click(
             predict,
             inputs=[in_img, sel_ckpt, sel_interp],
             outputs=[out_lbl]
         )
-        # optional example folder
         if not os.path.exists("examples"):
             os.makedirs("examples")
-            print("Put some jpg/png files inside ./examples for demo examples")
         example_files = [os.path.join("examples", f)
                          for f in os.listdir("examples")
                          if f.lower().endswith(('.png', '.jpg', '.jpeg'))]
@@ -453,6 +288,5 @@ def launch():
             )
     demo.launch()
-# --------------------------------------------------
 if __name__ == "__main__":
     launch()

 # -*- coding: utf-8 -*-
 """
 -------------------------------------------------------------------
 • Swin-V2 / V4                          : 2-class  (AI vs. Non-AI)
 • Swin-V7 / V8 / V9                     : 4-class  (photo / anime × AI / Non-AI)
+• CAFormer-V2.5                         : 4-class  (photo / anime × AI / Non-AI)
+• V3-Emb                                : 2-class  (AI vs. Non-AI)
 -------------------------------------------------------------------
 """
 import os, torch, timm, numpy as np
 import torch.nn as nn
 from PIL import Image
 import gradio as gr
 from huggingface_hub import hf_hub_download
+from safetensors.torch import load_file
 from transformers import AutoModel
 from torchvision import transforms
 REPO_ID = "telecomadm1145/swin-ai-detection"
 HF_FILENAMES = {
     "V2.5-CAFormer":      "caformer_b36_4class_96.safetensors",
 LOCAL_CKPT_DIR = "./checkpoints"
 SEED = 4421
 DROP_RATE = 0.1
+DROPOUT_RATE = 0.1
 device = "cuda" if torch.cuda.is_available() else "cpu"
 torch.manual_seed(SEED);  np.random.seed(SEED)
 print(f"Using device: {device}")
     def forward(self, x):
         return self.net(x)
 class EmbeddingClassifierModel(nn.Module):
     def __init__(self, timm_model_name, num_classes):
         super().__init__()
     def forward(self, x):
         features = self.backbone(x)
         prob_class0 = self.classifier(features)
         prob_class1 = 1 - prob_class0
         return torch.cat([prob_class0, prob_class1], dim=1)
 class SwinClassifier(nn.Module):
     def __init__(self, model_name, num_classes, pretrained=True,
                  head_version="v4"):
             model_name, pretrained=pretrained, num_classes=0
         )
         self.data_config = timm.data.resolve_data_config({}, model=self.backbone)
         if head_version == "v7":            # <-- V7, V8, V9, V10: 极简 64-hidden, GELU
             self.classifier = nn.Sequential(
                 nn.Dropout(DROP_RATE),
     def forward(self, x):
         return self.classifier(self.backbone(x))
 def load_model(ckpt_name: str):
     global model, current_ckpt, current_meta
     if ckpt_name == current_ckpt and model is not None:
         return
     meta = CKPT_META[ckpt_name]
     ckpt_filename = HF_FILENAMES[ckpt_name]
     head_version = meta.get("head", "v4")
     if head_version == "embedding_classifier":
         print(f"Creating backbone: {meta['timm_model_name']}")
         model = EmbeddingClassifierModel(
             timm_model_name=meta["timm_model_name"],
             num_classes=meta["n_cls"]
         ).to(device)
         print(f"Loading backbone weights from {meta['backbone_repo_id']}...")
         backbone_ckpt_file = hf_hub_download(
             repo_id=meta["backbone_repo_id"],
         model.backbone.load_state_dict(backbone_state,strict=False)
         print("✅ Backbone weights loaded.")
         print(f"Loading classifier head weights from {REPO_ID}...")
         classifier_ckpt_file = hf_hub_download(
             repo_id=REPO_ID,
+            filename=ckpt_filename,
             local_dir=LOCAL_CKPT_DIR, force_download=False
         )
         classifier_state = torch.load(classifier_ckpt_file, map_location=device, weights_only=False)
         print("✅ Classifier head weights loaded.")
     else:
         ckpt_file = hf_hub_download(
             repo_id=REPO_ID,
             filename=ckpt_filename,
         )
         print(f"Checkpoint: {ckpt_file}")
+        model = SwinClassifier(
+            meta["backbone"],
+            num_classes=meta["n_cls"],
+            pretrained=False,
+            head_version=head_version
+        ).to(device)
         if ckpt_filename.endswith(".safetensors"):
             state = load_file(ckpt_file, device=device)
         else:
     current_ckpt, current_meta = ckpt_name, meta
     print(f"✅ {ckpt_name} loaded (classes = {meta['n_cls']}).")
 def build_transform(is_training: bool, interpolation: str):
     """Build a timm transform from the currently loaded model's data config.

     The module-level ``model``'s ``data_config`` is copied, its
     ``interpolation`` entry is overridden with the requested mode, and the
     result is handed to ``timm.data.create_transform``.

     Args:
         is_training: Forwarded to ``timm.data.create_transform``.
         interpolation: Interpolation mode written into the copied config.

     Raises:
         RuntimeError: If no model has been loaded yet (``model`` is None).
     """
     if model is None:
         raise RuntimeError("Model not loaded yet.")
     # Work on a copy so the model's own data_config is never mutated.
     transform_cfg = model.data_config.copy()
     transform_cfg["interpolation"] = interpolation
     return timm.data.create_transform(is_training=is_training, **transform_cfg)
 def pil_ensure_rgb(image: Image.Image) -> Image.Image:
     if image.mode not in ["RGB", "RGBA"]:
         image = image.convert("RGBA") if "transparency" in image.info else image.convert("RGB")
     if image.mode == "RGBA":
         canvas = Image.new("RGBA", image.size, (255, 255, 255))
         canvas.alpha_composite(image)
 def pil_pad_square(image: Image.Image) -> Image.Image:
     """Return ``image`` pasted onto the centre of a white square RGB canvas.

     The canvas side equals the larger of the image's two dimensions, so the
     image is padded rather than resized.
     """
     width, height = image.size
     side = max(width, height)
     # Floor-divided offsets centre the image along the shorter axis.
     offset = ((side - width) // 2, (side - height) // 2)
     square = Image.new("RGB", (side, side), (255, 255, 255))
     square.paste(image, offset)
     return square
 @torch.no_grad()
 def predict(image: Image.Image,
             ckpt_name: str,
     if image is None: return None
     load_model(ckpt_name)
     if "Emb" in ckpt_name:
         processed_image = pil_ensure_rgb(image)
         processed_image = pil_pad_square(processed_image)
         tfm = build_transform(False, interpolation)
         inp = tfm(processed_image).unsqueeze(0).to(device)
         inp = inp[:, [2, 1, 0]]
     else:
         tfm = build_transform(False, interpolation)
         inp = tfm(image).unsqueeze(0).to(device)
     if current_meta["head"] == "embedding_classifier":
         probs = model(inp)[0].cpu()
     else:
         probs = F.softmax(model(inp), dim=1)[0].cpu()
     class_names = current_meta["names"]
     return {class_names[i]: float(probs[i])
             for i in range(len(class_names))}
 def launch():
+    load_model(DEFAULT_CKPT)
     with gr.Blocks(theme=gr.themes.Soft()) as demo:
         gr.Markdown("# AI Detector")
         gr.Markdown(
             "Choose a model checkpoint on the left, upload an image, "
+            "and click **Run** to see predictions. V3-Emb produces the best results."
         )
         with gr.Row():
             with gr.Column(scale=1):
                 in_img  = gr.Image(type="pil", label="Upload Image")
             with gr.Column(scale=1):
                 out_lbl = gr.Label(num_top_classes=4, label="Predictions")
         run_btn.click(
             predict,
             inputs=[in_img, sel_ckpt, sel_interp],
             outputs=[out_lbl]
         )
         if not os.path.exists("examples"):
             os.makedirs("examples")
         example_files = [os.path.join("examples", f)
                          for f in os.listdir("examples")
                          if f.lower().endswith(('.png', '.jpg', '.jpeg'))]
             )
     demo.launch()
 if __name__ == "__main__":
     launch()