Update app.py
app.py CHANGED

@@ -42,10 +42,13 @@ CKPT_META = {
         "names": ["non_ai", "ai", "ani_non_ai", "ani_ai"]},
     "V2.5-CAFormer": { "n_cls": 4, "head": "v7", "backbone": "caformer_b36.sail_in22k_ft_in1k_384",
         "names": ["non_ai", "ai", "ani_non_ai", "ani_ai"]},
+    # --- MODIFIED: Added specific keys for the new loading logic ---
     "EmbeddingClassifier": {
         "n_cls": 2,
         "head": "embedding_classifier",
-        "
+        "timm_model_name": "swinv2_base_window8_256.ms_in1k",
+        "backbone_repo_id": "SmilingWolf/wd-swinv2-tagger-v3",
+        "backbone_filename": "model.safetensors",
         "names": ["Non-AI Generated", "AI Generated"]
     }
 }
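Note: the new keys (timm_model_name, backbone_repo_id, backbone_filename) are consumed only by the embedding_classifier branch of load_model further down. A minimal sketch of a startup check that would catch a missing key early; the helper name and REQUIRED_EMBEDDING_KEYS constant are illustrative additions, not part of app.py:

# Hypothetical sanity check; not part of app.py.
REQUIRED_EMBEDDING_KEYS = ("timm_model_name", "backbone_repo_id", "backbone_filename")

def validate_ckpt_meta(ckpt_meta: dict) -> None:
    for name, meta in ckpt_meta.items():
        if meta.get("head") == "embedding_classifier":
            missing = [k for k in REQUIRED_EMBEDDING_KEYS if k not in meta]
            if missing:
                raise KeyError(f"{name}: missing meta keys {missing}")
        # Every entry should pair n_cls with a matching names list for gr.Label.
        if len(meta["names"]) != meta["n_cls"]:
            raise ValueError(f"{name}: names/n_cls mismatch")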
@@ -60,7 +63,6 @@ print(f"Using device: {device}")
 model, current_ckpt = None, None
 current_meta = None
 
-# --- EmbeddingClassifier Model ---
 class EmbeddingClassifier(nn.Module):
     def __init__(self, input_dim=1024, hidden_dim1=512, hidden_dim2=256, output_dim=1):
         super().__init__()
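Note: the body of EmbeddingClassifier.net is outside this hunk, so its exact layers are not visible here. A minimal sketch consistent with the constructor defaults (1024 -> 512 -> 256 -> 1) and with the probability-style output that later hunks assume; the layer choices and the final Sigmoid are assumptions, and the class name is illustrative:

import torch.nn as nn

class EmbeddingClassifierSketch(nn.Module):
    """Illustrative only: mirrors the constructor signature visible in the diff."""
    def __init__(self, input_dim=1024, hidden_dim1=512, hidden_dim2=256, output_dim=1):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(input_dim, hidden_dim1), nn.ReLU(),
            nn.Linear(hidden_dim1, hidden_dim2), nn.ReLU(),
            nn.Linear(hidden_dim2, output_dim),
            nn.Sigmoid(),  # assumed, since the output is treated as a probability downstream
        )

    def forward(self, x):
        return self.net(x)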
@@ -79,28 +81,32 @@ class EmbeddingClassifier(nn.Module):
     def forward(self, x):
         return self.net(x)
 
+# MODIFIED: Changed __init__ to accept timm_model_name and use pretrained=False
 class EmbeddingClassifierModel(nn.Module):
-    def __init__(self,
+    def __init__(self, timm_model_name, num_classes):
         super().__init__()
-        self.backbone = timm.create_model(
+        self.backbone = timm.create_model(timm_model_name, pretrained=False, num_classes=0)
         self.data_config = timm.data.resolve_data_config({}, model=self.backbone)
-
-        # that indicates if an image is real (Non-AI). So, output_dim is fixed to 1.
-        self.classifier = EmbeddingClassifier(input_dim=1024, hidden_dim1=512, hidden_dim2=256, output_dim=1)
+        self.classifier = EmbeddingClassifier(input_dim=self.backbone.num_features)
 
     def forward(self, x):
         features = self.backbone(x)
-        # The classifier returns a single
-
-        logit_class0 = self.classifier(features)
+        # The classifier returns a single value (probability of being Non-AI)
+        prob_class0 = self.classifier(features)
 
-        # To maintain compatibility with the `predict` function which expects multi-class
-        # and applies softmax, we construct a 2-class
-        #
-
+        # To maintain compatibility with the `predict` function which expects multi-class outputs
+        # and applies softmax, we construct a 2-class output.
+        # prob_class1 is simply 1 - prob_class0
+        prob_class1 = 1 - prob_class0
+
+        # The final output is for ["Non-AI", "AI"], i.e., [prob_class0, prob_class1].
+        # The softmax in predict() will be applied to this, so we should return logits.
+        # However, since the original output is a sigmoid, we can work with probabilities
+        # and just return them directly. The gr.Label will normalize this.
+        # A simpler way is to construct logits that would result in these probabilities.
+        # Let's stick to the original logic's output format.
+        return torch.cat([prob_class0, prob_class1], dim=1)
 
-        # The final logits are for ["Non-AI", "AI"], i.e., [logit_class0, logit_class1].
-        return torch.cat([logit_class0, logit_class1], dim=1)
 
 # --- Original DINOv2 Classifier (Weighted Attention Pooling) ---
 class DINOv2Classifier_WeightedPool(nn.Module):
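Note: since prob_class0 is already a probability, concatenating [p, 1 - p] yields a vector that sums to 1; applying softmax on top of it again would squash the scores (for example, softmax([0.9, 0.1]) is roughly [0.69, 0.31]), which is why the predict() change in the last hunk skips softmax for this head. A small check of that behaviour, with illustrative values standing in for real model outputs:

import torch
import torch.nn.functional as F

p0 = torch.tensor([[0.9]])            # sigmoid output: P(Non-AI), illustrative value
two_class = torch.cat([p0, 1 - p0], dim=1)

print(two_class)                      # tensor([[0.9000, 0.1000]]) -- already sums to 1
print(F.softmax(two_class, dim=1))    # tensor([[0.6900, 0.3100]]) -- softmax would distort it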
@@ -234,22 +240,48 @@ def load_model(ckpt_name: str):
     meta = CKPT_META[ckpt_name]
     ckpt_filename = HF_FILENAMES[ckpt_name]
 
-    ckpt_file = hf_hub_download(
-        repo_id=REPO_ID,
-        filename=ckpt_filename,
-        local_dir=LOCAL_CKPT_DIR, force_download=False
-    )
-    print(f"Checkpoint: {ckpt_file}")
-
-    # Build model structure based on model_type or head
+    # --- MODIFIED: Special handling for EmbeddingClassifier ---
     head_version = meta.get("head", "v4")
     if head_version == "embedding_classifier":
+        # 1. Create the model structure with a non-pretrained backbone
+        print(f"Creating backbone: {meta['timm_model_name']}")
         model = EmbeddingClassifierModel(
-
+            timm_model_name=meta["timm_model_name"],
             num_classes=meta["n_cls"]
         ).to(device)
+
+        # 2. Download and load backbone weights from SmilingWolf's repo
+        print(f"Loading backbone weights from {meta['backbone_repo_id']}...")
+        backbone_ckpt_file = hf_hub_download(
+            repo_id=meta["backbone_repo_id"],
+            filename=meta["backbone_filename"],
+            local_dir=LOCAL_CKPT_DIR, force_download=False
+        )
+        backbone_state = load_file(backbone_ckpt_file, device=device)
+        model.backbone.load_state_dict(backbone_state)
+        print("✅ Backbone weights loaded.")
+
+        # 3. Download and load classifier (head) weights from the main repo
+        print(f"Loading classifier head weights from {REPO_ID}...")
+        classifier_ckpt_file = hf_hub_download(
+            repo_id=REPO_ID,
+            filename=ckpt_filename,  # This is 'swinv2_v3_v1.pth'
+            local_dir=LOCAL_CKPT_DIR, force_download=False
+        )
+        classifier_state = torch.load(classifier_ckpt_file, map_location=device, weights_only=False)
+        model.classifier.load_state_dict(classifier_state)
+        print("✅ Classifier head weights loaded.")
+
     else:
-        #
+        # --- Original logic for all other models ---
+        ckpt_file = hf_hub_download(
+            repo_id=REPO_ID,
+            filename=ckpt_filename,
+            local_dir=LOCAL_CKPT_DIR, force_download=False
+        )
+        print(f"Checkpoint: {ckpt_file}")
+
+        # Build model structure based on model_type or head
         model_type = meta.get("model_type")
         if model_type == "dinov2_weighted_pool":
             model = DINOv2Classifier_WeightedPool(
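Note: loading model.safetensors from SmilingWolf/wd-swinv2-tagger-v3 into a timm backbone created with num_classes=0 assumes the tensor names in that file line up with timm's swinv2 state dict, and that any tagger-head parameters are absent or tolerated. A hedged diagnostic to inspect the overlap before committing to a strict load_state_dict; the helper name is illustrative and not part of app.py:

from safetensors.torch import load_file

def report_key_overlap(model_module, ckpt_path):
    """Diagnostic only: compares checkpoint keys with the module's expected keys."""
    ckpt_keys = set(load_file(ckpt_path).keys())
    model_keys = set(model_module.state_dict().keys())
    print("missing from checkpoint:", sorted(model_keys - ckpt_keys)[:10])
    print("unexpected in checkpoint:", sorted(ckpt_keys - model_keys)[:10])
    # If only head parameters are unexpected, load_state_dict(..., strict=False) may suffice.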
@@ -269,18 +301,12 @@ def load_model(ckpt_name: str):
             head_version=head_version
         ).to(device)
 
-
-
-
-
-
-
-    # Load state dict
-    if head_version == "embedding_classifier":
-        # For EmbeddingClassifierModel, we need to load the state dict for the classifier part
-        # Assuming the checkpoint only contains the classifier state dict
-        model.classifier.load_state_dict(state)
-    else:
+        # Compatible load for .pth and .safetensors
+        if ckpt_filename.endswith(".safetensors"):
+            state = load_file(ckpt_file, device=device)
+        else:
+            state = torch.load(ckpt_file, map_location=device, weights_only=False)
+
         model.load_state_dict(state.get("model_state_dict", state), strict=True)
 
     model.eval()
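Note: weights_only=False keeps compatibility with checkpoints that pickle arbitrary Python objects; when a .pth file is known to contain only tensors, weights_only=True (available in recent PyTorch releases) is the safer call. A minimal sketch of that stricter variant with a fallback; the helper name is illustrative, not part of app.py:

import torch

def load_checkpoint(path, device):
    """Prefer the safer weights_only load; fall back for pickled checkpoints."""
    try:
        return torch.load(path, map_location=device, weights_only=True)
    except Exception:
        # Older checkpoints that pickle non-tensor objects need the permissive path.
        return torch.load(path, map_location=device, weights_only=False)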
@@ -318,7 +344,13 @@ def predict(image: Image.Image,
     tfm = build_transform(False, interpolation)
 
     inp = tfm(image).unsqueeze(0).to(device)
-    probs = F.softmax(model(inp), dim=1)[0].cpu()
+    # MODIFIED: For EmbeddingClassifier, the output is already probabilities, no need for softmax.
+    # For others, softmax is needed.
+    if current_meta["head"] == "embedding_classifier":
+        probs = model(inp)[0].cpu()
+    else:
+        probs = F.softmax(model(inp), dim=1)[0].cpu()
+
     class_names = current_meta["names"]
     # Ensure gr.Label displays correctly for both 2-class and 4-class models
     return {class_names[i]: float(probs[i])
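Note: both branches now hand gr.Label a dict of class names to probabilities that sum to 1: the embedding head because it already returns [p, 1 - p], the other heads because softmax normalises their logits. A quick self-check with stand-in tensors (illustrative values, not real model outputs):

import torch
import torch.nn.functional as F

class_names = ["Non-AI Generated", "AI Generated"]

embedding_out = torch.tensor([[0.9, 0.1]])   # already probabilities
logit_out = torch.tensor([[2.0, -1.0]])      # raw logits -> need softmax

for probs in (embedding_out[0], F.softmax(logit_out, dim=1)[0]):
    label_dict = {class_names[i]: float(probs[i]) for i in range(len(class_names))}
    assert abs(sum(label_dict.values()) - 1.0) < 1e-6
    print(label_dict)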