Update app.py

app.py CHANGED
@@ -1,6 +1,6 @@
 # -*- coding: utf-8 -*-
 """
-Swin-Large AI vs. Non-AI Detector (
+Swin-Large AI vs. Non-AI Detector (attention-based visualization)
 """
 import os
 import math
@@ -11,28 +11,30 @@ import timm
 import numpy as np
 from PIL import Image
 import gradio as gr
-from collections import defaultdict
 
 from huggingface_hub import hf_hub_download
+from torchvision import transforms
+import matplotlib.pyplot as plt
+import matplotlib.cm as cm
 
 # --- Configuration ---------------------------------------------------------
 REPO_ID = "telecomadm1145/swin-ai-detection"
 HF_FILENAME = "swin_classifier_stage1_v2_epoch_3.pth"
 LOCAL_CKPT_DIR = "./checkpoints"
-MODEL_NAME = "swin_large_patch4_window12_384"
+MODEL_NAME = "swin_large_patch4_window12_384"  # ← the large variant
 NUM_CLASSES = 2
 SEED = 4421
 dropout_rate = 0.1
 
-class_names = ["Non-AI Generated", "AI Generated"]
+class_names = ["Non-AI Generated", "AI Generated"]  # 0, 1
 
 device = "cuda" if torch.cuda.is_available() else "cpu"
 torch.manual_seed(SEED); np.random.seed(SEED)
 print(f"Using device: {device}")
 
 # ---------------------------------------------------------------------------
-# 1.
-class SwinClassifier(nn.Module):
+# 1. Model structure, modified to extract attention
+class SwinClassifierWithAttention(nn.Module):
     def __init__(self, model_name, num_classes, pretrained=True):
         super().__init__()
         self.backbone = timm.create_model(model_name, pretrained=pretrained,
@@ -52,34 +54,31 @@ class SwinClassifier(nn.Module):
             nn.Linear(128, num_classes)
         )
 
+        # Storage for the attention weights captured by the hooks
+        self.attention_weights = {}
         self.register_attention_hooks()
 
     def register_attention_hooks(self):
-        """
+        """Register hook functions that capture attention weights."""
+        def hook_fn(name):
             def hook(module, input, output):
+                # For the Swin Transformer window-attention modules;
+                # the output is usually in (B, N, C) format
                 if hasattr(module, 'attn'):
                     # Grab the attention weights
-                    if attn_weights is not None:
-                        self.attention_maps[layer_name].append(attn_weights.detach().cpu())
+                    self.attention_weights[name] = module.attn.attention_weights
             return hook
 
+        # Register a hook on every block of every stage
         for stage_idx, stage in enumerate(self.backbone.layers):
             for block_idx, block in enumerate(stage.blocks):
-    def clear_attention_maps(self):
-        """Clear the stored attention maps."""
-        self.attention_maps.clear()
+                hook_name = f"stage_{stage_idx}_block_{block_idx}"
+                if hasattr(block, 'attn'):
+                    block.attn.register_forward_hook(hook_fn(hook_name))
 
     def forward(self, x):
+        # Clear the attention weights from the previous pass
+        self.attention_weights = {}
         feats = self.backbone(x)
         return self.classifier(feats)
 
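
Note on the hook above: it is registered on `block.attn` itself, yet the callback checks `hasattr(module, 'attn')` and reads `module.attn.attention_weights`, and stock timm `WindowAttention` modules expose no `attention_weights` attribute, so whether anything is captured depends on the timm build in use. A minimal sketch of an alternative that works with the classic (non-fused) timm implementation is to hook the attention module's `attn_drop` submodule, whose output is the post-softmax attention matrix; `swin_tiny_patch4_window7_224` is used here only as a small stand-in:

import timm
import torch

demo_model = timm.create_model("swin_tiny_patch4_window7_224", pretrained=False).eval()
captured = {}

def make_hook(name):
    def hook(module, inputs, output):
        # attn_drop runs right after the softmax, so its output is the
        # [num_windows * B, num_heads, N, N] window-attention matrix
        captured[name] = output.detach().cpu()
    return hook

for s_idx, stage in enumerate(demo_model.layers):
    for b_idx, block in enumerate(stage.blocks):
        if hasattr(block, "attn") and hasattr(block.attn, "attn_drop"):
            block.attn.attn_drop.register_forward_hook(make_hook(f"stage_{s_idx}_block_{b_idx}"))

with torch.no_grad():
    demo_model(torch.randn(1, 3, 224, 224))

print({k: tuple(v.shape) for k, v in captured.items()})
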
@@ -88,282 +87,248 @@ class SwinClassifier(nn.Module):
 class AttentionExtractor:
     def __init__(self, model):
         self.model = model
-        self.hooks = []
         self.attention_maps = {}
 
-        self.clear_hooks()
-
-        # Register a hook on the attention module of every block in every stage
-        for stage_idx, stage in enumerate(self.model.backbone.layers):
-            for block_idx, block in enumerate(stage.blocks):
-                if hasattr(block, 'attn'):
-                    name = f"stage_{stage_idx}_block_{block_idx}"
-                    hook = block.attn.register_forward_hook(get_attention_hook(name))
-                    self.hooks.append(hook)
-
-    def clear_hooks(self):
-        """Remove all registered hooks."""
-        for hook in self.hooks:
-            hook.remove()
-        self.hooks = []
-
-    def clear_attention_maps(self):
-        """Clear the stored attention maps."""
-        self.attention_maps.clear()
-
-def create_attention_visualization(attention_weights, input_size, stage_info):
-    """
-    Build an attention visualization.
-
-    Args:
-        attention_weights: [B, num_heads, N, N] attention weights
-        input_size: input image size (H, W)
-        stage_info: stage information, used to determine the resolution
-    """
-    if attention_weights is None or len(attention_weights) == 0:
-        return None
-
-    # Take the first sample and average over all heads
-    attn = attention_weights[0].mean(dim=0)  # [N, N]
-
-    # Use the [CLS] token's attention to the other tokens (if present);
-    # Swin usually has no CLS token, so compute the mean attention instead
-    attn_map = attn.mean(dim=0)  # [N]
-
-    # Determine the feature-map size
-    N = attn_map.shape[0]
-    feat_size = int(math.sqrt(N))
-
-    if feat_size * feat_size != N:
-        # Not a perfect square, so extra tokens may be included
-        feat_size = int(math.sqrt(N))
-        attn_map = attn_map[:feat_size*feat_size]
-
-    # Reshape to 2D
-    attn_2d = attn_map.reshape(feat_size, feat_size)
-
-    # Convert to numpy and normalize
-    attn_np = attn_2d.numpy()
-    attn_np = (attn_np - attn_np.min()) / (attn_np.max() - attn_np.min() + 1e-8)
-
-    # Resize to the input image size
-    attn_img = Image.fromarray((attn_np * 255).astype(np.uint8), mode='L')
-    attn_img = attn_img.resize(input_size, Image.Resampling.BILINEAR)
-
-    return attn_img
+    def extract_attention_weights(self, x):
+        """Extract the attention weights of every hooked layer."""
+        with torch.no_grad():
+            _ = self.model(x)  # the forward pass triggers the hooks
+        return self.model.attention_weights.copy()
+
+    def process_attention_for_visualization(self, attention_weights, input_size):
+        """Turn raw attention weights into 2D maps for visualization."""
+        processed_maps = {}
+
+        for layer_name, attn_weight in attention_weights.items():
+            if attn_weight is None:
+                continue
+
+            # attn_weight shape: [batch_size, num_heads, seq_len, seq_len]
+            if len(attn_weight.shape) == 4:
+                # Average over all attention heads
+                attn_map = attn_weight.mean(dim=1)  # [batch_size, seq_len, seq_len]
+
+                # Take the first sample
+                attn_map = attn_map[0]  # [seq_len, seq_len]
+
+                # Self-attention visualizations usually track the CLS token,
+                # but Swin has no CLS token, so average over all tokens instead
+                if attn_map.shape[0] > 1:
+                    # Mean attention score received by each position
+                    avg_attention = attn_map.mean(dim=0)  # [seq_len]
+
+                    # Reshape the attention scores into a 2D feature map
+                    seq_len = avg_attention.shape[0]
+                    grid_size = int(math.sqrt(seq_len))
+
+                    if grid_size * grid_size == seq_len:
+                        attention_2d = avg_attention.reshape(grid_size, grid_size)
+                        processed_maps[layer_name] = attention_2d
+
+        return processed_maps
 
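
For intuition, `process_attention_for_visualization` collapses a `[B, num_heads, N, N]` weight tensor to a single √N × √N map: average the heads, take the mean attention each token receives, then reshape. A toy run with hypothetical sizes (4 heads, N = 144 tokens, i.e. a 12 × 12 grid):

import math
import torch

attn_weight = torch.softmax(torch.randn(1, 4, 144, 144), dim=-1)  # [B, heads, N, N]

attn_map = attn_weight.mean(dim=1)[0]   # average heads, first sample -> [N, N]
avg_attention = attn_map.mean(dim=0)    # mean attention received per token -> [N]

grid_size = int(math.sqrt(avg_attention.shape[0]))            # 12
assert grid_size * grid_size == avg_attention.numel()
attention_2d = avg_attention.reshape(grid_size, grid_size)    # [12, 12] map
print(attention_2d.shape)               # torch.Size([12, 12])
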
 # ---------------------------------------------------------------------------
+# 3. Download / cache the checkpoint
 print("⏬ Download / cache checkpoint …")
 ckpt_path = hf_hub_download(
     repo_id  = REPO_ID,
     filename = HF_FILENAME,
     local_dir = LOCAL_CKPT_DIR,
-    force_download=False
+    force_download=False  # reuse the file if it already exists
 )
 print(f"Checkpoint path: {ckpt_path}")
 
+# ---------------------------------------------------------------------------
+# 4. Instantiate the model & load the weights
+model = SwinClassifierWithAttention(MODEL_NAME, NUM_CLASSES, pretrained=False).to(device)
 state = torch.load(ckpt_path, map_location=device, weights_only=False)
-model.load_state_dict(state.get("model_state_dict", state), strict=
+model.load_state_dict(state.get("model_state_dict", state), strict=False)  # strict=False: new components were added
 model.eval()
 print("✅ Model loaded.")
 
-# Create the attention extractor
 attention_extractor = AttentionExtractor(model)
 
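
Because `strict=False` silently skips any keys that do not line up, it is worth inspecting what was actually loaded; `load_state_dict` returns the missing and unexpected key lists. A small check, reusing `model` and `state` from the lines above:

result = model.load_state_dict(state.get("model_state_dict", state), strict=False)
print("missing keys:   ", result.missing_keys)     # weights the checkpoint lacked
print("unexpected keys:", result.unexpected_keys)  # checkpoint entries with no match
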
 # ---------------------------------------------------------------------------
+# 5. Transform builder
 def build_transform(is_training: bool, interpolation: str):
+    """
+    Build the default timm transform for the given interpolation
+    (bilinear / bicubic / nearest).
+    """
     cfg = model.data_config.copy()
     cfg.update(dict(interpolation=interpolation))
     return timm.data.create_transform(**cfg, is_training=is_training)
 
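
`build_transform` assumes a `model.data_config` attribute, which neither PyTorch nor timm sets automatically on a wrapper module, so it presumably gets attached elsewhere in the app. In recent timm releases the equivalent configuration can be resolved straight from the backbone; a sketch:

import timm

backbone = timm.create_model("swin_large_patch4_window12_384", pretrained=False)
data_config = timm.data.resolve_model_data_config(backbone)
# e.g. input_size, interpolation, mean, std resolved from the model's pretrained cfg
print(data_config)

transform = timm.data.create_transform(**data_config, is_training=False)
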
 # ---------------------------------------------------------------------------
+# 6. Attention visualization helper
+def visualize_attention(attention_map, original_image, normalize=True):
+    """Overlay an attention map on the original image."""
+    if normalize:
+        # Normalize the attention map to [0, 1]
+        attention_map = attention_map.cpu().numpy()
+        attention_map = (attention_map - attention_map.min()) / (attention_map.max() - attention_map.min() + 1e-8)
+    else:
+        attention_map = attention_map.cpu().numpy()
+
+    # Resize the attention map to the original image size
+    attention_resized = Image.fromarray((attention_map * 255).astype(np.uint8)) \
+                             .resize(original_image.size, Image.Resampling.BILINEAR)
+
+    # Convert to a heat map
+    attention_array = np.array(attention_resized) / 255.0
+    heatmap = cm.jet(attention_array)[:, :, :3]  # drop the alpha channel
+
+    # Blend with the original image
+    original_array = np.array(original_image) / 255.0
+    if len(original_array.shape) == 3:
+        overlay = 0.6 * original_array + 0.4 * heatmap
+    else:
+        # Grayscale input: replicate to three channels
+        original_array = np.stack([original_array] * 3, axis=-1)
+        overlay = 0.6 * original_array + 0.4 * heatmap
+
+    overlay = np.clip(overlay, 0, 1)
+    return Image.fromarray((overlay * 255).astype(np.uint8))
+
+# ---------------------------------------------------------------------------
+# 7. Inference + attention visualization
+def infer_with_attention(image_pil: Image.Image,
+                         interpolation: str = "bilinear",
+                         attention_layer: str = "stage_3_block_1",
+                         stage_average: bool = False,
+                         normalize_attention: bool = True):
     if image_pil is None:
         return None, None
 
     transform = build_transform(is_training=False, interpolation=interpolation)
     input_tensor = transform(image_pil).unsqueeze(0).to(device)
 
-    # Classification
-    logits = model(input_tensor)
-    probs = F.softmax(logits, dim=1)[0]
-    confidences = {class_names[i]: float(probs[i]) for i in range(NUM_CLASSES)}
-
-        return confidences, None, "Classification complete"
-
-    # Get the attention of the selected layer
-    layer_info = f"Currently displayed layer: {selected_layer}"
-
-            # Build the heat map (red marks high attention)
-            colored_array = np.zeros((*attn_array.shape, 3), dtype=np.uint8)
-            colored_array[:, :, 0] = attn_array       # red channel
-            colored_array[:, :, 1] = attn_array // 2  # green channel (attenuated)
-
-            attention_colored = Image.fromarray(colored_array)
-
-            # Blend with the original image
-            blended = Image.blend(image_pil.convert('RGB'), attention_colored, alpha=0.4)
-
-            return confidences, blended, f"{layer_info} - attention visualization complete"
+    # (1) Classification
+    with torch.no_grad():
+        logits = model(input_tensor)
+        probs = F.softmax(logits, dim=1)[0]
+    confidences = {class_names[i]: float(probs[i]) for i in range(NUM_CLASSES)}
+
+    # (2) Extract the attention weights
+    attention_weights = attention_extractor.extract_attention_weights(input_tensor)
+
+    if not attention_weights:
+        return confidences, None
+
+    # (3) Process the attention weights
+    processed_attention = attention_extractor.process_attention_for_visualization(
+        attention_weights, input_tensor.shape[-2:]
+    )
+
+    if not processed_attention:
+        return confidences, None
+
+    # (4) Pick the attention layer to visualize
+    if stage_average:
+        # Average the attention over every block of the chosen stage
+        stage_num = attention_layer.split('_')[1]
+        stage_attentions = []
+
+        for layer_name, attn_map in processed_attention.items():
+            if f"stage_{stage_num}_" in layer_name:
+                stage_attentions.append(attn_map)
+
+        if stage_attentions:
+            # Mean attention across the collected blocks
+            avg_attention = torch.stack(stage_attentions).mean(dim=0)
+            attention_vis = visualize_attention(avg_attention, image_pil, normalize_attention)
         else:
             return confidences, None
     else:
-            layers.append(f"stage_{stage_idx}_block_{block_idx}")
-    return layers
+        # Use the attention of the chosen layer
+        if attention_layer in processed_attention:
+            attention_vis = visualize_attention(
+                processed_attention[attention_layer], image_pil, normalize_attention
+            )
+        else:
+            # Fall back to the first available layer
+            first_layer = list(processed_attention.keys())[0]
+            attention_vis = visualize_attention(
+                processed_attention[first_layer], image_pil, normalize_attention
+            )
+
+    return confidences, attention_vis
 
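
`infer_with_attention` can also be exercised outside the UI. A quick smoke test, where `sample.jpg` is only a hypothetical local file:

from PIL import Image

img = Image.open("sample.jpg").convert("RGB")  # hypothetical test image
confidences, heatmap = infer_with_attention(
    img,
    interpolation="bicubic",
    attention_layer="stage_3_block_1",
    stage_average=True,
)
print(confidences)  # dict mapping class name -> probability
if heatmap is not None:
    heatmap.save("attention_overlay.png")
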
 # ---------------------------------------------------------------------------
-# Gradio UI
+# 8. Gradio UI
 def launch_app():
-    with gr.Blocks(title="AI Image Detector with Attention") as demo:
+    with gr.Blocks() as demo:
         gr.Markdown("""
-        # 🖼️ AI vs. Non-AI Image Classifier
-
-        - 🎯 Uses the Swin Transformer's native attention weights
-        - 🎨 Visualizes the attention patterns of different layers
-        - 🔄 Supports several interpolation methods to tune preprocessing
-
-        1. Upload an image
-        2. Pick the attention layer to visualize
-        3. Pick an interpolation method (bicubic recommended)
-        4. Click Run
-
-        - For research and educational use only
-        - 100% accuracy is not guaranteed
-        - Please use responsibly
+        # 🖼️ AI vs. Non-AI Image Classifier (Swin-Large + Attention Visualization)
+
+        🖼️ AI detector (Swin-Large vision backbone, with attention heat-map output)
+
+        Uses the Swin Transformer's self-attention to visualize the regions the model focuses on.
+
+        Notice: bicubic usually gives the best results. Please use this tool responsibly.
+
+        This tool is for research and educational use only.
         """)
 
         with gr.Row():
-            with gr.Row():
-                interp_choice = gr.Radio(
-                    ["bilinear", "bicubic", "nearest"],
-                    value="bicubic",
-                    label="Interpolation"
-                )
-
-                attention_toggle = gr.Checkbox(
-                    value=True,
-                    label="Show attention visualization"
-                )
-
-                layer_choice = gr.Dropdown(
-                    choices=available_layers,
-                    value="stage_3_block_1",
-                    label="Attention layer",
-                    info="Different layers attend to different levels of features"
-                )
-
-                run_btn = gr.Button("🚀 Run Detection", variant="primary")
-
-            with gr.Column():
-                out_lbl = gr.Label(
-                    num_top_classes=2,
-                    label="Predictions"
-                )
-                out_attention = gr.Image(
-                    type="pil",
-                    label="Attention visualization"
-                )
-                status_text = gr.Textbox(
-                    label="Status",
-                    interactive=False
-                )
-
-        # Layer-selection notes
-        gr.Markdown("""
-        ### Layer selection guide:
-        - **Stage 0-1**: low-level features (edges, texture)
-        - **Stage 2**: mid-level features (shapes, local patterns)
-        - **Stage 3**: high-level features (semantics, global structure)
-        """)
+            in_img = gr.Image(type="pil", label="Upload an Image")
+            out_attention = gr.Image(type="pil", label="Attention Heatmap")
+
+        with gr.Row():
+            out_lbl = gr.Label(num_top_classes=2, label="Predictions")
+
+        with gr.Row():
+            interp_choice = gr.Radio(
+                ["bilinear", "bicubic", "nearest"], value="bicubic",
+                label="Resize Interpolation"
+            )
+
+        with gr.Row():
+            attention_layer_choice = gr.Dropdown(
+                choices=[
+                    "stage_0_block_0", "stage_0_block_1",
+                    "stage_1_block_0", "stage_1_block_1",
+                    "stage_2_block_0", "stage_2_block_1", "stage_2_block_2",
+                    "stage_3_block_0", "stage_3_block_1"
+                ],
+                value="stage_3_block_1",
+                label="Attention Layer"
+            )
+
+        with gr.Row():
+            stage_avg_toggle = gr.Checkbox(
+                value=False,
+                label="Average Stage Attention"
+            )
+            normalize_toggle = gr.Checkbox(
+                value=True,
+                label="Normalize Attention"
+            )
+
+        run_btn = gr.Button("🚀 Run Analysis")
+
+        def _run(img, inter, attn_layer, stage_avg, normalize):
+            return infer_with_attention(
+                img,
+                interpolation=inter,
+                attention_layer=attn_layer,
+                stage_average=stage_avg,
+                normalize_attention=normalize
+            )
 
         run_btn.click(
             _run,
-            inputs=[in_img, interp_choice,
-            outputs=[out_lbl, out_attention
+            inputs=[in_img, interp_choice, attention_layer_choice, stage_avg_toggle, normalize_toggle],
+            outputs=[out_lbl, out_attention]
         )
 
+        gr.Markdown("""
+        ### Notes:
+        - **Attention layer**: pick different Swin Transformer layers to inspect their attention patterns
+        - **Stage average**: when checked, averages the attention of every block in the selected stage
+        - **Normalize**: rescales the attention values into the 0-1 range for easier viewing
+        - **Heat map**: red marks the regions the model attends to most, blue the least
+        """)
+
     demo.launch()
 
 # ---------------------------------------------------------------------------
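
The diff does not show how `launch_app` is invoked; assuming the file ends here, a conventional entry-point guard would be:

if __name__ == "__main__":
    launch_app()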