Spaces:
				
			
			
	
			
			
		Runtime error
		
	
	
	
			
			
	
	
	
	
		
		
		Runtime error
		
	Commit 
							
							·
						
						e88a846
	
1
								Parent(s):
							
							563f033
								
Add FantasyTalking Hugging Face Space demo with complete deployment guide
Browse files- .gitignore +89 -0
- DEPLOYMENT.md +161 -0
- README.md +87 -6
- app.py +219 -0
- assets/README.md +7 -0
- deploy.py +129 -0
- infer.py +168 -0
- model.py +99 -0
- requirements.txt +15 -0
- utils.py +70 -0
    	
        .gitignore
    ADDED
    
    | @@ -0,0 +1,89 @@ | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | 
|  | |
| 1 | 
            +
            # Gitignore for FantasyTalking project
         | 
| 2 | 
            +
             | 
| 3 | 
            +
            # Python
         | 
| 4 | 
            +
            __pycache__/
         | 
| 5 | 
            +
            *.py[cod]
         | 
| 6 | 
            +
            *$py.class
         | 
| 7 | 
            +
            *.so
         | 
| 8 | 
            +
            .Python
         | 
| 9 | 
            +
            build/
         | 
| 10 | 
            +
            develop-eggs/
         | 
| 11 | 
            +
            dist/
         | 
| 12 | 
            +
            downloads/
         | 
| 13 | 
            +
            eggs/
         | 
| 14 | 
            +
            .eggs/
         | 
| 15 | 
            +
            lib/
         | 
| 16 | 
            +
            lib64/
         | 
| 17 | 
            +
            parts/
         | 
| 18 | 
            +
            sdist/
         | 
| 19 | 
            +
            var/
         | 
| 20 | 
            +
            wheels/
         | 
| 21 | 
            +
            pip-wheel-metadata/
         | 
| 22 | 
            +
            share/python-wheels/
         | 
| 23 | 
            +
            *.egg-info/
         | 
| 24 | 
            +
            .installed.cfg
         | 
| 25 | 
            +
            *.egg
         | 
| 26 | 
            +
            MANIFEST
         | 
| 27 | 
            +
             | 
| 28 | 
            +
            # PyTorch
         | 
| 29 | 
            +
            *.pth
         | 
| 30 | 
            +
            *.pt
         | 
| 31 | 
            +
            *.ckpt
         | 
| 32 | 
            +
            *.safetensors
         | 
| 33 | 
            +
             | 
| 34 | 
            +
            # Model files
         | 
| 35 | 
            +
            models/
         | 
| 36 | 
            +
            *.bin
         | 
| 37 | 
            +
            *.h5
         | 
| 38 | 
            +
             | 
| 39 | 
            +
            # Output
         | 
| 40 | 
            +
            output/
         | 
| 41 | 
            +
            results/
         | 
| 42 | 
            +
            *.mp4
         | 
| 43 | 
            +
            *.avi
         | 
| 44 | 
            +
            *.mov
         | 
| 45 | 
            +
            *.mkv
         | 
| 46 | 
            +
             | 
| 47 | 
            +
            # Jupyter Notebook
         | 
| 48 | 
            +
            .ipynb_checkpoints
         | 
| 49 | 
            +
             | 
| 50 | 
            +
            # Environment
         | 
| 51 | 
            +
            .env
         | 
| 52 | 
            +
            .venv
         | 
| 53 | 
            +
            env/
         | 
| 54 | 
            +
            venv/
         | 
| 55 | 
            +
            ENV/
         | 
| 56 | 
            +
            env.bak/
         | 
| 57 | 
            +
            venv.bak/
         | 
| 58 | 
            +
             | 
| 59 | 
            +
            # IDE
         | 
| 60 | 
            +
            .vscode/
         | 
| 61 | 
            +
            .idea/
         | 
| 62 | 
            +
            *.swp
         | 
| 63 | 
            +
            *.swo
         | 
| 64 | 
            +
             | 
| 65 | 
            +
            # OS
         | 
| 66 | 
            +
            .DS_Store
         | 
| 67 | 
            +
            .DS_Store?
         | 
| 68 | 
            +
            ._*
         | 
| 69 | 
            +
            .Spotlight-V100
         | 
| 70 | 
            +
            .Trashes
         | 
| 71 | 
            +
            ehthumbs.db
         | 
| 72 | 
            +
            Thumbs.db
         | 
| 73 | 
            +
             | 
| 74 | 
            +
            # Logs
         | 
| 75 | 
            +
            *.log
         | 
| 76 | 
            +
            logs/
         | 
| 77 | 
            +
             | 
| 78 | 
            +
            # Cache
         | 
| 79 | 
            +
            .cache/
         | 
| 80 | 
            +
            .huggingface/
         | 
| 81 | 
            +
             | 
| 82 | 
            +
            # Gradio
         | 
| 83 | 
            +
            gradio_cached_examples/
         | 
| 84 | 
            +
            flagged/
         | 
| 85 | 
            +
             | 
| 86 | 
            +
            # Large files
         | 
| 87 | 
            +
            *.zip
         | 
| 88 | 
            +
            *.tar.gz
         | 
| 89 | 
            +
            *.tar.bz2
         | 
    	
        DEPLOYMENT.md
    ADDED
    
    | @@ -0,0 +1,161 @@ | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | 
|  | |
| 1 | 
            +
            # FantasyTalking Hugging Face Space 部署指南
         | 
| 2 | 
            +
             | 
| 3 | 
            +
            本项目是FantasyTalking的Hugging Face Space演示版本,由于模型体积巨大(40GB+)和GPU内存需求,在线Space主要展示界面,完整功能需要本地部署。
         | 
| 4 | 
            +
             | 
| 5 | 
            +
            ## 🚀 在Hugging Face Space中部署
         | 
| 6 | 
            +
             | 
| 7 | 
            +
            ### 方法1: 直接复制项目
         | 
| 8 | 
            +
             | 
| 9 | 
            +
            1. 登录 [Hugging Face](https://huggingface.co/)
         | 
| 10 | 
            +
            2. 创建新的Space: https://huggingface.co/new-space
         | 
| 11 | 
            +
            3. 选择Gradio SDK
         | 
| 12 | 
            +
            4. 将本项目所有文件上传到Space
         | 
| 13 | 
            +
             | 
| 14 | 
            +
            ### 方法2: 从GitHub导入
         | 
| 15 | 
            +
             | 
| 16 | 
            +
            1. Fork原始仓库: https://github.com/Fantasy-AMAP/fantasy-talking
         | 
| 17 | 
            +
            2. 在Hugging Face创建Space时选择"Import from GitHub"
         | 
| 18 | 
            +
            3. 输入你的GitHub仓库地址
         | 
| 19 | 
            +
             | 
| 20 | 
            +
            ### Space配置
         | 
| 21 | 
            +
             | 
| 22 | 
            +
            确保在Space的README.md中包含以下配置:
         | 
| 23 | 
            +
             | 
| 24 | 
            +
            ```yaml
         | 
| 25 | 
            +
            ---
         | 
| 26 | 
            +
            title: FantasyTalking Demo
         | 
| 27 | 
            +
            emoji: 🎬
         | 
| 28 | 
            +
            colorFrom: blue
         | 
| 29 | 
            +
            colorTo: purple
         | 
| 30 | 
            +
            sdk: gradio
         | 
| 31 | 
            +
            sdk_version: 5.34.2
         | 
| 32 | 
            +
            app_file: app.py
         | 
| 33 | 
            +
            pinned: false
         | 
| 34 | 
            +
            license: apache-2.0
         | 
| 35 | 
            +
            ---
         | 
| 36 | 
            +
            ```
         | 
| 37 | 
            +
             | 
| 38 | 
            +
            ## 💻 本地完整部署
         | 
| 39 | 
            +
             | 
| 40 | 
            +
            ### 环境要求
         | 
| 41 | 
            +
             | 
| 42 | 
            +
            - **操作系统**: Linux/Windows/macOS
         | 
| 43 | 
            +
            - **Python**: 3.8+
         | 
| 44 | 
            +
            - **GPU**: NVIDIA GPU with CUDA
         | 
| 45 | 
            +
            - **VRAM**: 至少5GB(推荐20GB+)
         | 
| 46 | 
            +
            - **存储**: 50GB+可用空间
         | 
| 47 | 
            +
            - **内存**: 16GB+
         | 
| 48 | 
            +
             | 
| 49 | 
            +
            ### 快速部署
         | 
| 50 | 
            +
             | 
| 51 | 
            +
            ```bash
         | 
| 52 | 
            +
            # 1. 克隆仓库
         | 
| 53 | 
            +
            git clone https://github.com/Fantasy-AMAP/fantasy-talking.git
         | 
| 54 | 
            +
            cd fantasy-talking
         | 
| 55 | 
            +
             | 
| 56 | 
            +
            # 2. 自动部署(推荐)
         | 
| 57 | 
            +
            python deploy.py
         | 
| 58 | 
            +
             | 
| 59 | 
            +
            # 3. 手动部署
         | 
| 60 | 
            +
            # 安装依赖
         | 
| 61 | 
            +
            pip install -r requirements.txt
         | 
| 62 | 
            +
            pip install flash_attn  # 可选,需要CUDA
         | 
| 63 | 
            +
             | 
| 64 | 
            +
            # 下载模型
         | 
| 65 | 
            +
            huggingface-cli download Wan-AI/Wan2.1-I2V-14B-720P --local-dir ./models/Wan2.1-I2V-14B-720P
         | 
| 66 | 
            +
            huggingface-cli download facebook/wav2vec2-base-960h --local-dir ./models/wav2vec2-base-960h
         | 
| 67 | 
            +
            huggingface-cli download acvlab/FantasyTalking fantasytalking_model.ckpt --local-dir ./models
         | 
| 68 | 
            +
             | 
| 69 | 
            +
            # 运行应用
         | 
| 70 | 
            +
            python app.py
         | 
| 71 | 
            +
            ```
         | 
| 72 | 
            +
             | 
| 73 | 
            +
            ### Docker部署
         | 
| 74 | 
            +
             | 
| 75 | 
            +
            ```dockerfile
         | 
| 76 | 
            +
            FROM nvidia/cuda:11.8-devel-ubuntu20.04
         | 
| 77 | 
            +
             | 
| 78 | 
            +
            RUN apt-get update && apt-get install -y \
         | 
| 79 | 
            +
                python3 python3-pip git ffmpeg \
         | 
| 80 | 
            +
                && rm -rf /var/lib/apt/lists/*
         | 
| 81 | 
            +
             | 
| 82 | 
            +
            WORKDIR /app
         | 
| 83 | 
            +
            COPY requirements.txt .
         | 
| 84 | 
            +
            RUN pip3 install -r requirements.txt
         | 
| 85 | 
            +
             | 
| 86 | 
            +
            COPY . .
         | 
| 87 | 
            +
            CMD ["python3", "app.py"]
         | 
| 88 | 
            +
            ```
         | 
| 89 | 
            +
             | 
| 90 | 
            +
            ## 🔧 配置选项
         | 
| 91 | 
            +
             | 
| 92 | 
            +
            ### 内存优化
         | 
| 93 | 
            +
             | 
| 94 | 
            +
            根据GPU内存调整`num_persistent_param_in_dit`参数:
         | 
| 95 | 
            +
             | 
| 96 | 
            +
            - **40GB+ VRAM**: `None` (无限制,最快)
         | 
| 97 | 
            +
            - **20GB VRAM**: `7000000000` (7B参数)
         | 
| 98 | 
            +
            - **5GB VRAM**: `0` (最省内存,较慢)
         | 
| 99 | 
            +
             | 
| 100 | 
            +
            ### 模型精度
         | 
| 101 | 
            +
             | 
| 102 | 
            +
            - `torch.bfloat16`: 推荐,平衡速度和质量
         | 
| 103 | 
            +
            - `torch.float16`: 更快,可能影响质量
         | 
| 104 | 
            +
            - `torch.float32`: 最高质量,需要更多内存
         | 
| 105 | 
            +
             | 
| 106 | 
            +
            ## 📊 性能参考
         | 
| 107 | 
            +
             | 
| 108 | 
            +
            | 配置 | GPU | VRAM | 生成时间 (81帧) |
         | 
| 109 | 
            +
            |------|-----|------|----------------|
         | 
| 110 | 
            +
            | 最高质量 | A100 | 40GB | 15.5s/it |
         | 
| 111 | 
            +
            | 平衡模式 | RTX 4090 | 20GB | 32.8s/it |
         | 
| 112 | 
            +
            | 节能模式 | RTX 3060 | 5GB | 42.6s/it |
         | 
| 113 | 
            +
             | 
| 114 | 
            +
            ## 🛠 故障排除
         | 
| 115 | 
            +
             | 
| 116 | 
            +
            ### 常见问题
         | 
| 117 | 
            +
             | 
| 118 | 
            +
            1. **CUDA内存不足**
         | 
| 119 | 
            +
               ```bash
         | 
| 120 | 
            +
               # 设置环境变量
         | 
| 121 | 
            +
               export PYTORCH_CUDA_ALLOC_CONF=max_split_size_mb:512
         | 
| 122 | 
            +
               ```
         | 
| 123 | 
            +
             | 
| 124 | 
            +
            2. **模型下载失败**
         | 
| 125 | 
            +
               ```bash
         | 
| 126 | 
            +
               # 使用镜像
         | 
| 127 | 
            +
               export HF_ENDPOINT=https://hf-mirror.com
         | 
| 128 | 
            +
               ```
         | 
| 129 | 
            +
             | 
| 130 | 
            +
            3. **依赖冲突**
         | 
| 131 | 
            +
               ```bash
         | 
| 132 | 
            +
               # 使用虚拟环境
         | 
| 133 | 
            +
               python -m venv fantasy_talking
         | 
| 134 | 
            +
               source fantasy_talking/bin/activate  # Linux/Mac
         | 
| 135 | 
            +
               # fantasy_talking\Scripts\activate  # Windows
         | 
| 136 | 
            +
               ```
         | 
| 137 | 
            +
             | 
| 138 | 
            +
            ### 日志和调试
         | 
| 139 | 
            +
             | 
| 140 | 
            +
            ```bash
         | 
| 141 | 
            +
            # 启用详细日志
         | 
| 142 | 
            +
            export PYTHONPATH=.
         | 
| 143 | 
            +
            export CUDA_LAUNCH_BLOCKING=1
         | 
| 144 | 
            +
            python app.py --debug
         | 
| 145 | 
            +
            ```
         | 
| 146 | 
            +
             | 
| 147 | 
            +
            ## 🌐 在线资源
         | 
| 148 | 
            +
             | 
| 149 | 
            +
            - **原始仓库**: https://github.com/Fantasy-AMAP/fantasy-talking
         | 
| 150 | 
            +
            - **论文**: https://arxiv.org/abs/2504.04842
         | 
| 151 | 
            +
            - **模型**: https://huggingface.co/acvlab/FantasyTalking
         | 
| 152 | 
            +
            - **在线演示**: https://huggingface.co/spaces/acvlab/FantasyTalking
         | 
| 153 | 
            +
             | 
| 154 | 
            +
            ## 📄 许可证
         | 
| 155 | 
            +
             | 
| 156 | 
            +
            本项目遵循Apache-2.0许可证。详见原始仓库。
         | 
| 157 | 
            +
             | 
| 158 | 
            +
            ## 🤝 贡献
         | 
| 159 | 
            +
             | 
| 160 | 
            +
            欢迎提交Issue和Pull Request到原始仓库:
         | 
| 161 | 
            +
            https://github.com/Fantasy-AMAP/fantasy-talking
         | 
    	
        README.md
    CHANGED
    
    | @@ -1,13 +1,94 @@ | |
| 1 | 
             
            ---
         | 
| 2 | 
            -
            title:  | 
| 3 | 
            -
            emoji:  | 
| 4 | 
            -
            colorFrom:  | 
| 5 | 
            -
            colorTo:  | 
| 6 | 
             
            sdk: gradio
         | 
| 7 | 
             
            sdk_version: 5.34.2
         | 
| 8 | 
             
            app_file: app.py
         | 
| 9 | 
             
            pinned: false
         | 
| 10 | 
            -
             | 
|  | |
| 11 | 
             
            ---
         | 
| 12 |  | 
| 13 | 
            -
             | 
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | 
|  | |
| 1 | 
             
            ---
         | 
| 2 | 
            +
            title: FantasyTalking Demo
         | 
| 3 | 
            +
            emoji: �
         | 
| 4 | 
            +
            colorFrom: blue
         | 
| 5 | 
            +
            colorTo: purple
         | 
| 6 | 
             
            sdk: gradio
         | 
| 7 | 
             
            sdk_version: 5.34.2
         | 
| 8 | 
             
            app_file: app.py
         | 
| 9 | 
             
            pinned: false
         | 
| 10 | 
            +
            license: apache-2.0
         | 
| 11 | 
            +
            short_description: Realistic Talking Portrait Generation via Coherent Motion Synthesis
         | 
| 12 | 
             
            ---
         | 
| 13 |  | 
| 14 | 
            +
            # FantasyTalking: Realistic Talking Portrait Generation via Coherent Motion Synthesis
         | 
| 15 | 
            +
             | 
| 16 | 
            +
            This is a Hugging Face Space demo for the FantasyTalking project, which generates realistic talking portraits from a single image and audio input.
         | 
| 17 | 
            +
             | 
| 18 | 
            +
            ## 🔥 Features
         | 
| 19 | 
            +
             | 
| 20 | 
            +
            - **Single Image Input**: Generate talking videos from just one portrait image
         | 
| 21 | 
            +
            - **Audio-driven Animation**: Synchronize lip movements with input audio
         | 
| 22 | 
            +
            - **High Quality Output**: 512x512 resolution with up to 81 frames
         | 
| 23 | 
            +
            - **Controllable Generation**: Adjust prompt and audio guidance scales
         | 
| 24 | 
            +
             | 
| 25 | 
            +
            ## 📋 Requirements
         | 
| 26 | 
            +
             | 
| 27 | 
            +
            Due to the large model size (~40GB+) and GPU memory requirements, this demo shows the interface but requires local deployment for full functionality.
         | 
| 28 | 
            +
             | 
| 29 | 
            +
            ### System Requirements
         | 
| 30 | 
            +
            - NVIDIA GPU with at least 5GB VRAM (low memory mode)
         | 
| 31 | 
            +
            - 20GB+ VRAM recommended for optimal performance
         | 
| 32 | 
            +
            - 50GB+ storage space for models
         | 
| 33 | 
            +
             | 
| 34 | 
            +
            ## 🚀 Local Deployment
         | 
| 35 | 
            +
             | 
| 36 | 
            +
            To run FantasyTalking locally with full functionality:
         | 
| 37 | 
            +
             | 
| 38 | 
            +
            ```bash
         | 
| 39 | 
            +
            # 1. Clone the repository
         | 
| 40 | 
            +
            git clone https://github.com/Fantasy-AMAP/fantasy-talking.git
         | 
| 41 | 
            +
            cd fantasy-talking
         | 
| 42 | 
            +
             | 
| 43 | 
            +
            # 2. Install dependencies
         | 
| 44 | 
            +
            pip install -r requirements.txt
         | 
| 45 | 
            +
            pip install flash_attn  # Optional, for accelerated attention computation
         | 
| 46 | 
            +
             | 
| 47 | 
            +
            # 3. Download models
         | 
| 48 | 
            +
            # Base model (~20GB)
         | 
| 49 | 
            +
            huggingface-cli download Wan-AI/Wan2.1-I2V-14B-720P --local-dir ./models/Wan2.1-I2V-14B-720P
         | 
| 50 | 
            +
             | 
| 51 | 
            +
            # Audio encoder (~1GB)
         | 
| 52 | 
            +
            huggingface-cli download facebook/wav2vec2-base-960h --local-dir ./models/wav2vec2-base-960h
         | 
| 53 | 
            +
             | 
| 54 | 
            +
            # FantasyTalking weights (~2GB)
         | 
| 55 | 
            +
            huggingface-cli download acvlab/FantasyTalking fantasytalking_model.ckpt --local-dir ./models
         | 
| 56 | 
            +
             | 
| 57 | 
            +
            # 4. Run inference
         | 
| 58 | 
            +
            python infer.py --image_path ./assets/images/woman.png --audio_path ./assets/audios/woman.wav
         | 
| 59 | 
            +
             | 
| 60 | 
            +
            # 5. Start web interface
         | 
| 61 | 
            +
            python app.py
         | 
| 62 | 
            +
            ```
         | 
| 63 | 
            +
             | 
| 64 | 
            +
            ## 🎯 Performance
         | 
| 65 | 
            +
             | 
| 66 | 
            +
            Model performance on single A100 (512x512, 81 frames):
         | 
| 67 | 
            +
             | 
| 68 | 
            +
            | torch_dtype | num_persistent_param_in_dit | Speed | Required VRAM |
         | 
| 69 | 
            +
            |------------|----------------------------|-------|---------------|
         | 
| 70 | 
            +
            | torch.bfloat16 | None (unlimited) | 15.5s/it | 40G |
         | 
| 71 | 
            +
            | torch.bfloat16 | 7×10⁹ (7B) | 32.8s/it | 20G |
         | 
| 72 | 
            +
            | torch.bfloat16 | 0 | 42.6s/it | 5G |
         | 
| 73 | 
            +
             | 
| 74 | 
            +
            ## 📖 Citation
         | 
| 75 | 
            +
             | 
| 76 | 
            +
            ```bibtex
         | 
| 77 | 
            +
            @article{wang2025fantasytalking,
         | 
| 78 | 
            +
               title={FantasyTalking: Realistic Talking Portrait Generation via Coherent Motion Synthesis},
         | 
| 79 | 
            +
               author={Wang, Mengchao and Wang, Qiang and Jiang, Fan and Fan, Yaqi and Zhang, Yunpeng and Qi, Yonggang and Zhao, Kun and Xu, Mu},
         | 
| 80 | 
            +
               journal={arXiv preprint arXiv:2504.04842},
         | 
| 81 | 
            +
               year={2025}
         | 
| 82 | 
            +
            }
         | 
| 83 | 
            +
            ```
         | 
| 84 | 
            +
             | 
| 85 | 
            +
            ## 🔗 Links
         | 
| 86 | 
            +
             | 
| 87 | 
            +
            - **Paper**: [arXiv:2504.04842](https://arxiv.org/abs/2504.04842)
         | 
| 88 | 
            +
            - **Code**: [GitHub Repository](https://github.com/Fantasy-AMAP/fantasy-talking)
         | 
| 89 | 
            +
            - **Models**: [Hugging Face](https://huggingface.co/acvlab/FantasyTalking)
         | 
| 90 | 
            +
            - **Project Page**: [FantasyTalking](https://fantasy-amap.github.io/fantasy-talking/)
         | 
| 91 | 
            +
             | 
| 92 | 
            +
            ## 📄 License
         | 
| 93 | 
            +
             | 
| 94 | 
            +
            This project is licensed under the Apache-2.0 License.
         | 
    	
        app.py
    ADDED
    
    | @@ -0,0 +1,219 @@ | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | 
|  | |
| 1 | 
            +
            # Copyright Alibaba Inc. All Rights Reserved.
         | 
| 2 | 
            +
             | 
| 3 | 
            +
            import argparse
         | 
| 4 | 
            +
            import os
         | 
| 5 | 
            +
            import subprocess
         | 
| 6 | 
            +
            from datetime import datetime
         | 
| 7 | 
            +
            from pathlib import Path
         | 
| 8 | 
            +
             | 
| 9 | 
            +
            import gradio as gr
         | 
| 10 | 
            +
            import librosa
         | 
| 11 | 
            +
            import torch
         | 
| 12 | 
            +
            from PIL import Image
         | 
| 13 | 
            +
            from transformers import Wav2Vec2Model, Wav2Vec2Processor
         | 
| 14 | 
            +
             | 
| 15 | 
            +
            # 由于在Hugging Face Space中运行,我们需要简化导入
         | 
| 16 | 
            +
            # from diffsynth import ModelManager, WanVideoPipeline
         | 
| 17 | 
            +
            # from model import FantasyTalkingAudioConditionModel
         | 
| 18 | 
            +
            # from utils import get_audio_features, resize_image_by_longest_edge, save_video
         | 
| 19 | 
            +
             | 
| 20 | 
            +
            pipe, fantasytalking, wav2vec_processor, wav2vec = None, None, None, None
         | 
| 21 | 
            +
             | 
| 22 | 
            +
            # 简化版的推理函数用于演示
         | 
| 23 | 
            +
            def generate_video(
         | 
| 24 | 
            +
                image_path,
         | 
| 25 | 
            +
                audio_path,
         | 
| 26 | 
            +
                prompt,
         | 
| 27 | 
            +
                prompt_cfg_scale,
         | 
| 28 | 
            +
                audio_cfg_scale,
         | 
| 29 | 
            +
                audio_weight,
         | 
| 30 | 
            +
                image_size,
         | 
| 31 | 
            +
                max_num_frames,
         | 
| 32 | 
            +
                inference_steps,
         | 
| 33 | 
            +
                seed,
         | 
| 34 | 
            +
            ):
         | 
| 35 | 
            +
                """
         | 
| 36 | 
            +
                简化版的视频生成函数,用于演示目的
         | 
| 37 | 
            +
                在实际部署中,需要加载完整的模型
         | 
| 38 | 
            +
                """
         | 
| 39 | 
            +
                # 创建输出目录
         | 
| 40 | 
            +
                output_dir = Path("./output")
         | 
| 41 | 
            +
                output_dir.mkdir(parents=True, exist_ok=True)
         | 
| 42 | 
            +
                
         | 
| 43 | 
            +
                # 这里应该是实际的推理代码
         | 
| 44 | 
            +
                # 目前返回一个提示信息
         | 
| 45 | 
            +
                return "模型正在准备中,请等待完整版本部署"
         | 
| 46 | 
            +
             | 
| 47 | 
            +
            def create_args(
         | 
| 48 | 
            +
                image_path: str,
         | 
| 49 | 
            +
                audio_path: str,
         | 
| 50 | 
            +
                prompt: str,
         | 
| 51 | 
            +
                output_dir: str,
         | 
| 52 | 
            +
                audio_weight: float,
         | 
| 53 | 
            +
                prompt_cfg_scale: float,
         | 
| 54 | 
            +
                audio_cfg_scale: float,
         | 
| 55 | 
            +
                image_size: int,
         | 
| 56 | 
            +
                max_num_frames: int,
         | 
| 57 | 
            +
                inference_steps: int,
         | 
| 58 | 
            +
                seed: int,
         | 
| 59 | 
            +
            ) -> argparse.Namespace:
         | 
| 60 | 
            +
                """创建参数配置"""
         | 
| 61 | 
            +
                parser = argparse.ArgumentParser()
         | 
| 62 | 
            +
                parser.add_argument("--wan_model_dir", type=str, default="./models/Wan2.1-I2V-14B-720P")
         | 
| 63 | 
            +
                parser.add_argument("--fantasytalking_model_path", type=str, default="./models/fantasytalking_model.ckpt")
         | 
| 64 | 
            +
                parser.add_argument("--wav2vec_model_dir", type=str, default="./models/wav2vec2-base-960h")
         | 
| 65 | 
            +
                parser.add_argument("--image_path", type=str, default=image_path)
         | 
| 66 | 
            +
                parser.add_argument("--audio_path", type=str, default=audio_path)
         | 
| 67 | 
            +
                parser.add_argument("--prompt", type=str, default=prompt)
         | 
| 68 | 
            +
                parser.add_argument("--output_dir", type=str, default=output_dir)
         | 
| 69 | 
            +
                parser.add_argument("--image_size", type=int, default=image_size)
         | 
| 70 | 
            +
                parser.add_argument("--audio_scale", type=float, default=audio_weight)
         | 
| 71 | 
            +
                parser.add_argument("--prompt_cfg_scale", type=float, default=prompt_cfg_scale)
         | 
| 72 | 
            +
                parser.add_argument("--audio_cfg_scale", type=float, default=audio_cfg_scale)
         | 
| 73 | 
            +
                parser.add_argument("--max_num_frames", type=int, default=max_num_frames)
         | 
| 74 | 
            +
                parser.add_argument("--num_inference_steps", type=int, default=inference_steps)
         | 
| 75 | 
            +
                parser.add_argument("--seed", type=int, default=seed)
         | 
| 76 | 
            +
                parser.add_argument("--fps", type=int, default=24)
         | 
| 77 | 
            +
                parser.add_argument("--num_persistent_param_in_dit", type=int, default=7_000_000_000)
         | 
| 78 | 
            +
                
         | 
| 79 | 
            +
                return parser.parse_args([])
         | 
| 80 | 
            +
             | 
| 81 | 
            +
# Build the Gradio demo interface.
with gr.Blocks(title="FantasyTalking Video Generation") as demo:
    # Header banner: title, author list and badge links (rendered as-is).
    gr.Markdown(
        """
    # FantasyTalking: Realistic Talking Portrait Generation via Coherent Motion Synthesis
    
    <div align="center">
        <strong> Mengchao Wang1*  Qiang Wang1*  Fan Jiang1†
        Yaqi Fan2    Yunpeng Zhang1,2   YongGang Qi2‡
        Kun Zhao1.   Mu Xu1 </strong>
    </div>

    <div align="center">
        <strong>1AMAP,Alibaba Group   2Beijing University of Posts and Telecommunications</strong>
    </div>

    <div style="display:flex;justify-content:center;column-gap:4px;">
        <a href="https://github.com/Fantasy-AMAP/fantasy-talking">
            <img src='https://img.shields.io/badge/GitHub-Repo-blue'>
        </a>
        <a href="https://arxiv.org/abs/2504.04842">
            <img src='https://img.shields.io/badge/ArXiv-Paper-red'>
        </a>
    </div>
    
    ## 注意
    此演示版本正在准备中。完整功能需要下载大量模型文件(约40GB+)。
    请参考 [GitHub仓库](https://github.com/Fantasy-AMAP/fantasy-talking) 获取完整安装和使用说明。
    """
    )

    with gr.Row():
        # Left column: user inputs and all generation controls.
        with gr.Column():
            image_input = gr.Image(label="输入图像", type="filepath")
            audio_input = gr.Audio(label="输入音频", type="filepath")
            prompt_input = gr.Text(label="输入提示词", value="A woman is talking.")
            
            # Classifier-free-guidance scales and audio conditioning weight.
            with gr.Row():
                prompt_cfg_scale = gr.Slider(
                    minimum=1.0,
                    maximum=9.0,
                    value=5.0,
                    step=0.5,
                    label="提示词CFG比例",
                )
                audio_cfg_scale = gr.Slider(
                    minimum=1.0,
                    maximum=9.0,
                    value=5.0,
                    step=0.5,
                    label="音频CFG比例",
                )
                audio_weight = gr.Slider(
                    minimum=0.1,
                    maximum=3.0,
                    value=1.0,
                    step=0.1,
                    label="音频权重",
                )
            
            # Output resolution, clip length, and sampling-step controls.
            with gr.Row():
                image_size = gr.Number(
                    value=512, label="宽度/高度最大尺寸", precision=0
                )
                max_num_frames = gr.Number(
                    value=81, label="最大帧数", precision=0
                )
                inference_steps = gr.Slider(
                    minimum=1, maximum=50, value=20, step=1, label="推理步数"
                )

            with gr.Row():
                seed = gr.Number(value=1247, label="随机种子", precision=0)

            process_btn = gr.Button("生成视频")

        # Right column: generated video plus static usage/deployment notes.
        with gr.Column():
            video_output = gr.Video(label="输出视频")
            
            gr.Markdown(
                """
                ## 使用说明
                
                1. **上传图像**: 选择一张人物肖像图片
                2. **上传音频**: 选择对应的音频文件
                3. **设置参数**: 调整各种生成参数
                4. **生成视频**: 点击按钮开始生成
                
                ## 模型要求
                
                - **基础模型**: Wan2.1-I2V-14B-720P (~20GB)
                - **音频编码器**: Wav2Vec2 (~1GB)  
                - **FantasyTalking模型**: 专用权重文件 (~2GB)
                - **显存要求**: 至少5GB VRAM(设置为低内存模式)
                
                ## 本地部署
                
                ```bash
                # 1. 克隆仓库
                git clone https://github.com/Fantasy-AMAP/fantasy-talking.git
                cd fantasy-talking
                
                # 2. 安装依赖
                pip install -r requirements.txt
                pip install flash_attn  # 可选,加速注意力计算
                
                # 3. 下载模型
                huggingface-cli download Wan-AI/Wan2.1-I2V-14B-720P --local-dir ./models/Wan2.1-I2V-14B-720P
                huggingface-cli download facebook/wav2vec2-base-960h --local-dir ./models/wav2vec2-base-960h
                huggingface-cli download acvlab/FantasyTalking fantasytalking_model.ckpt --local-dir ./models
                
                # 4. 运行推理
                python infer.py --image_path ./assets/images/woman.png --audio_path ./assets/audios/woman.wav
                
                # 5. 启动Web界面
                python app.py
                ```
                """
            )

    # Wire the button to generate_video; the input order here must match
    # generate_video's parameter order exactly.
    process_btn.click(
        fn=generate_video,
        inputs=[
            image_input,
            audio_input,
            prompt_input,
            prompt_cfg_scale,
            audio_cfg_scale,
            audio_weight,
            image_size,
            max_num_frames,
            inference_steps,
            seed,
        ],
        outputs=video_output,
    )
         | 
| 217 | 
            +
             | 
| 218 | 
            +
if __name__ == "__main__":
    # NOTE(review): share=True opens a public Gradio tunnel and inbrowser=True
    # tries to open a local browser — both are unnecessary (and the latter is a
    # no-op) inside a headless Hugging Face Space container; confirm intent.
    demo.launch(inbrowser=True, share=True)
         | 
    	
        assets/README.md
    ADDED
    
    | @@ -0,0 +1,7 @@ | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | 
|  | |
| 1 | 
            +
            # 示例图像和音频文件
         | 
| 2 | 
            +
            # 在实际部署中,请将示例图像和音频文件放入对应目录
         | 
| 3 | 
            +
             | 
| 4 | 
            +
            # assets/images/woman.png - 示例女性肖像图片
         | 
| 5 | 
            +
            # assets/audios/woman.wav - 示例音频文件
         | 
| 6 | 
            +
             | 
| 7 | 
            +
            # 如果没有示例文件,用户可以通过界面上传自己的图像和音频
         | 
    	
        deploy.py
    ADDED
    
    | @@ -0,0 +1,129 @@ | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | 
|  | |
| 1 | 
            +
            # FantasyTalking部署脚本
         | 
| 2 | 
            +
             | 
| 3 | 
            +
            import os
         | 
| 4 | 
            +
            import subprocess
         | 
| 5 | 
            +
            import sys
         | 
| 6 | 
            +
            from pathlib import Path
         | 
| 7 | 
            +
             | 
| 8 | 
            +
             | 
| 9 | 
            +
            def check_gpu():
         | 
| 10 | 
            +
                """检查GPU可用性"""
         | 
| 11 | 
            +
                try:
         | 
| 12 | 
            +
                    import torch
         | 
| 13 | 
            +
                    if torch.cuda.is_available():
         | 
| 14 | 
            +
                        gpu_count = torch.cuda.device_count()
         | 
| 15 | 
            +
                        gpu_name = torch.cuda.get_device_name(0) if gpu_count > 0 else "Unknown"
         | 
| 16 | 
            +
                        gpu_memory = torch.cuda.get_device_properties(0).total_memory // (1024**3) if gpu_count > 0 else 0
         | 
| 17 | 
            +
                        
         | 
| 18 | 
            +
                        print(f"✅ GPU可用: {gpu_name}")
         | 
| 19 | 
            +
                        print(f"✅ GPU内存: {gpu_memory}GB")
         | 
| 20 | 
            +
                        
         | 
| 21 | 
            +
                        if gpu_memory < 5:
         | 
| 22 | 
            +
                            print("⚠️  警告: GPU内存可能不足,建议至少5GB VRAM")
         | 
| 23 | 
            +
                        
         | 
| 24 | 
            +
                        return True
         | 
| 25 | 
            +
                    else:
         | 
| 26 | 
            +
                        print("❌ 未检测到可用的GPU")
         | 
| 27 | 
            +
                        return False
         | 
| 28 | 
            +
                except ImportError:
         | 
| 29 | 
            +
                    print("❌ PyTorch未安装")
         | 
| 30 | 
            +
                    return False
         | 
| 31 | 
            +
             | 
| 32 | 
            +
             | 
| 33 | 
            +
            def install_dependencies():
         | 
| 34 | 
            +
                """安装依赖"""
         | 
| 35 | 
            +
                print("📦 安装依赖包...")
         | 
| 36 | 
            +
                subprocess.check_call([sys.executable, "-m", "pip", "install", "-r", "requirements.txt"])
         | 
| 37 | 
            +
                print("✅ 依赖安装完成")
         | 
| 38 | 
            +
             | 
| 39 | 
            +
             | 
| 40 | 
            +
            def download_models():
         | 
| 41 | 
            +
                """下载模型(需要huggingface-cli)"""
         | 
| 42 | 
            +
                print("📥 开始下载模型...")
         | 
| 43 | 
            +
                
         | 
| 44 | 
            +
                models_dir = Path("./models")
         | 
| 45 | 
            +
                models_dir.mkdir(exist_ok=True)
         | 
| 46 | 
            +
                
         | 
| 47 | 
            +
                # 检查huggingface-cli
         | 
| 48 | 
            +
                try:
         | 
| 49 | 
            +
                    subprocess.check_call(["huggingface-cli", "--help"], stdout=subprocess.DEVNULL)
         | 
| 50 | 
            +
                except (subprocess.CalledProcessError, FileNotFoundError):
         | 
| 51 | 
            +
                    print("安装huggingface-hub[cli]...")
         | 
| 52 | 
            +
                    subprocess.check_call([sys.executable, "-m", "pip", "install", "huggingface_hub[cli]"])
         | 
| 53 | 
            +
                
         | 
| 54 | 
            +
                # 下载模型
         | 
| 55 | 
            +
                models_to_download = [
         | 
| 56 | 
            +
                    ("Wan-AI/Wan2.1-I2V-14B-720P", "./models/Wan2.1-I2V-14B-720P"),
         | 
| 57 | 
            +
                    ("facebook/wav2vec2-base-960h", "./models/wav2vec2-base-960h"),
         | 
| 58 | 
            +
                ]
         | 
| 59 | 
            +
                
         | 
| 60 | 
            +
                for model_id, local_dir in models_to_download:
         | 
| 61 | 
            +
                    print(f"下载 {model_id}...")
         | 
| 62 | 
            +
                    subprocess.check_call([
         | 
| 63 | 
            +
                        "huggingface-cli", "download", model_id, 
         | 
| 64 | 
            +
                        "--local-dir", local_dir
         | 
| 65 | 
            +
                    ])
         | 
| 66 | 
            +
                
         | 
| 67 | 
            +
                # 下载FantasyTalking权重
         | 
| 68 | 
            +
                print("下载FantasyTalking权重...")
         | 
| 69 | 
            +
                subprocess.check_call([
         | 
| 70 | 
            +
                    "huggingface-cli", "download", "acvlab/FantasyTalking", 
         | 
| 71 | 
            +
                    "fantasytalking_model.ckpt", "--local-dir", "./models"
         | 
| 72 | 
            +
                ])
         | 
| 73 | 
            +
                
         | 
| 74 | 
            +
                print("✅ 模型下载完成")
         | 
| 75 | 
            +
             | 
| 76 | 
            +
             | 
| 77 | 
            +
            def check_model_files():
         | 
| 78 | 
            +
                """检查模型文件"""
         | 
| 79 | 
            +
                required_files = [
         | 
| 80 | 
            +
                    "./models/Wan2.1-I2V-14B-720P",
         | 
| 81 | 
            +
                    "./models/wav2vec2-base-960h", 
         | 
| 82 | 
            +
                    "./models/fantasytalking_model.ckpt"
         | 
| 83 | 
            +
                ]
         | 
| 84 | 
            +
                
         | 
| 85 | 
            +
                missing_files = []
         | 
| 86 | 
            +
                for file_path in required_files:
         | 
| 87 | 
            +
                    if not os.path.exists(file_path):
         | 
| 88 | 
            +
                        missing_files.append(file_path)
         | 
| 89 | 
            +
                
         | 
| 90 | 
            +
                if missing_files:
         | 
| 91 | 
            +
                    print("❌ 缺少以下模型文件:")
         | 
| 92 | 
            +
                    for file in missing_files:
         | 
| 93 | 
            +
                        print(f"   - {file}")
         | 
| 94 | 
            +
                    return False
         | 
| 95 | 
            +
                else:
         | 
| 96 | 
            +
                    print("✅ 所有模型文件已就绪")
         | 
| 97 | 
            +
                    return True
         | 
| 98 | 
            +
             | 
| 99 | 
            +
             | 
| 100 | 
            +
            def start_app():
         | 
| 101 | 
            +
                """启动应用"""
         | 
| 102 | 
            +
                print("🚀 启动FantasyTalking应用...")
         | 
| 103 | 
            +
                subprocess.check_call([sys.executable, "app.py"])
         | 
| 104 | 
            +
             | 
| 105 | 
            +
             | 
| 106 | 
            +
def main() -> None:
    """Run the end-to-end deployment flow: GPU check, dependency install,
    model download (only when files are missing), then app startup.

    Steps run strictly in order; each prints its own progress messages.
    """
    print("🎬 FantasyTalking 自动部署脚本")
    print("=" * 50)
    
    # A missing GPU is not fatal — only a warning; CPU mode still proceeds.
    if not check_gpu():
        print("⚠️  继续使用CPU模式(速度会很慢)")
    
    # Unconditional pip install of requirements.txt.
    install_dependencies()
    
    # Download the (very large) weights only when some are missing on disk.
    if not check_model_files():
        print("📥 需要下载模型文件...")
        download_models()
    
    print("✅ 部署完成!")
    print("\n启动应用...")
    # Blocks until the Gradio app process exits.
    start_app()
         | 
| 126 | 
            +
             | 
| 127 | 
            +
             | 
| 128 | 
            +
            if __name__ == "__main__":
         | 
| 129 | 
            +
                main()
         | 
    	
        infer.py
    ADDED
    
    | @@ -0,0 +1,168 @@ | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | 
|  | |
| 1 | 
            +
            # Copyright Alibaba Inc. All Rights Reserved.
         | 
| 2 | 
            +
             | 
| 3 | 
            +
            import argparse
         | 
| 4 | 
            +
            import os
         | 
| 5 | 
            +
            import subprocess
         | 
| 6 | 
            +
            from datetime import datetime
         | 
| 7 | 
            +
            from pathlib import Path
         | 
| 8 | 
            +
             | 
| 9 | 
            +
            import cv2
         | 
| 10 | 
            +
            import librosa
         | 
| 11 | 
            +
            import torch
         | 
| 12 | 
            +
            from PIL import Image
         | 
| 13 | 
            +
            from transformers import Wav2Vec2Model, Wav2Vec2Processor
         | 
| 14 | 
            +
             | 
| 15 | 
            +
            # 注意:以下导入在完整版本中需要
         | 
| 16 | 
            +
            # from diffsynth import ModelManager, WanVideoPipeline
         | 
| 17 | 
            +
            from model import FantasyTalkingAudioConditionModel
         | 
| 18 | 
            +
            from utils import get_audio_features, resize_image_by_longest_edge, save_video
         | 
| 19 | 
            +
             | 
| 20 | 
            +
             | 
| 21 | 
            +
            def parse_args():
         | 
| 22 | 
            +
                """解析命令行参数"""
         | 
| 23 | 
            +
                parser = argparse.ArgumentParser()
         | 
| 24 | 
            +
                parser.add_argument(
         | 
| 25 | 
            +
                    "--wan_model_dir",
         | 
| 26 | 
            +
                    type=str,
         | 
| 27 | 
            +
                    default="./models/Wan2.1-I2V-14B-720P",
         | 
| 28 | 
            +
                    help="Wan I2V 14B模型目录"
         | 
| 29 | 
            +
                )
         | 
| 30 | 
            +
                parser.add_argument(
         | 
| 31 | 
            +
                    "--fantasytalking_model_path",
         | 
| 32 | 
            +
                    type=str,
         | 
| 33 | 
            +
                    default="./models/fantasytalking_model.ckpt",
         | 
| 34 | 
            +
                    help="FantasyTalking模型路径"
         | 
| 35 | 
            +
                )
         | 
| 36 | 
            +
                parser.add_argument(
         | 
| 37 | 
            +
                    "--wav2vec_model_dir",
         | 
| 38 | 
            +
                    type=str,
         | 
| 39 | 
            +
                    default="./models/wav2vec2-base-960h",
         | 
| 40 | 
            +
                    help="Wav2Vec模型目录"
         | 
| 41 | 
            +
                )
         | 
| 42 | 
            +
                parser.add_argument(
         | 
| 43 | 
            +
                    "--image_path",
         | 
| 44 | 
            +
                    type=str,
         | 
| 45 | 
            +
                    default="./assets/images/woman.png",
         | 
| 46 | 
            +
                    help="输入图像路径"
         | 
| 47 | 
            +
                )
         | 
| 48 | 
            +
                parser.add_argument(
         | 
| 49 | 
            +
                    "--audio_path",
         | 
| 50 | 
            +
                    type=str,
         | 
| 51 | 
            +
                    default="./assets/audios/woman.wav",
         | 
| 52 | 
            +
                    help="输入音频路径"
         | 
| 53 | 
            +
                )
         | 
| 54 | 
            +
                parser.add_argument(
         | 
| 55 | 
            +
                    "--prompt",
         | 
| 56 | 
            +
                    type=str,
         | 
| 57 | 
            +
                    default="A woman is talking.",
         | 
| 58 | 
            +
                    help="提示词"
         | 
| 59 | 
            +
                )
         | 
| 60 | 
            +
                parser.add_argument(
         | 
| 61 | 
            +
                    "--output_dir",
         | 
| 62 | 
            +
                    type=str,
         | 
| 63 | 
            +
                    default="./output",
         | 
| 64 | 
            +
                    help="输出目录"
         | 
| 65 | 
            +
                )
         | 
| 66 | 
            +
                parser.add_argument(
         | 
| 67 | 
            +
                    "--image_size",
         | 
| 68 | 
            +
                    type=int,
         | 
| 69 | 
            +
                    default=512,
         | 
| 70 | 
            +
                    help="图像尺寸"
         | 
| 71 | 
            +
                )
         | 
| 72 | 
            +
                parser.add_argument(
         | 
| 73 | 
            +
                    "--audio_scale",
         | 
| 74 | 
            +
                    type=float,
         | 
| 75 | 
            +
                    default=1.0,
         | 
| 76 | 
            +
                    help="音频条件注入权重"
         | 
| 77 | 
            +
                )
         | 
| 78 | 
            +
                parser.add_argument(
         | 
| 79 | 
            +
                    "--prompt_cfg_scale",
         | 
| 80 | 
            +
                    type=float,
         | 
| 81 | 
            +
                    default=5.0,
         | 
| 82 | 
            +
                    help="提示词CFG比例"
         | 
| 83 | 
            +
                )
         | 
| 84 | 
            +
                parser.add_argument(
         | 
| 85 | 
            +
                    "--audio_cfg_scale",
         | 
| 86 | 
            +
                    type=float,
         | 
| 87 | 
            +
                    default=5.0,
         | 
| 88 | 
            +
                    help="音频CFG比例"
         | 
| 89 | 
            +
                )
         | 
| 90 | 
            +
                parser.add_argument(
         | 
| 91 | 
            +
                    "--max_num_frames",
         | 
| 92 | 
            +
                    type=int,
         | 
| 93 | 
            +
                    default=81,
         | 
| 94 | 
            +
                    help="最大帧数"
         | 
| 95 | 
            +
                )
         | 
| 96 | 
            +
                parser.add_argument(
         | 
| 97 | 
            +
                    "--num_inference_steps",
         | 
| 98 | 
            +
                    type=int,
         | 
| 99 | 
            +
                    default=30,
         | 
| 100 | 
            +
                    help="推理步数"
         | 
| 101 | 
            +
                )
         | 
| 102 | 
            +
                parser.add_argument(
         | 
| 103 | 
            +
                    "--seed",
         | 
| 104 | 
            +
                    type=int,
         | 
| 105 | 
            +
                    default=1247,
         | 
| 106 | 
            +
                    help="随机种子"
         | 
| 107 | 
            +
                )
         | 
| 108 | 
            +
                parser.add_argument(
         | 
| 109 | 
            +
                    "--fps",
         | 
| 110 | 
            +
                    type=int,
         | 
| 111 | 
            +
                    default=24,
         | 
| 112 | 
            +
                    help="帧率"
         | 
| 113 | 
            +
                )
         | 
| 114 | 
            +
                parser.add_argument(
         | 
| 115 | 
            +
                    "--num_persistent_param_in_dit",
         | 
| 116 | 
            +
                    type=int,
         | 
| 117 | 
            +
                    default=7_000_000_000,
         | 
| 118 | 
            +
                    help="DiT中持久参数数量,用于VRAM管理"
         | 
| 119 | 
            +
                )
         | 
| 120 | 
            +
                
         | 
| 121 | 
            +
                return parser.parse_args()
         | 
| 122 | 
            +
             | 
| 123 | 
            +
             | 
| 124 | 
            +
            def load_models(args):
         | 
| 125 | 
            +
                """加载模型"""
         | 
| 126 | 
            +
                print("正在加载模型...")
         | 
| 127 | 
            +
                
         | 
| 128 | 
            +
                # 在完整版本中,这里会加载实际的模型
         | 
| 129 | 
            +
                # model_manager = ModelManager(device="cpu")
         | 
| 130 | 
            +
                # model_manager.load_models([...])
         | 
| 131 | 
            +
                # pipe = WanVideoPipeline.from_model_manager(model_manager, torch_dtype=torch.bfloat16, device="cuda")
         | 
| 132 | 
            +
                
         | 
| 133 | 
            +
                # 模拟模型加载
         | 
| 134 | 
            +
                pipe = None
         | 
| 135 | 
            +
                fantasytalking = None
         | 
| 136 | 
            +
                wav2vec_processor = None
         | 
| 137 | 
            +
                wav2vec = None
         | 
| 138 | 
            +
                
         | 
| 139 | 
            +
                print("模型加载完成(演示模式)")
         | 
| 140 | 
            +
                return pipe, fantasytalking, wav2vec_processor, wav2vec
         | 
| 141 | 
            +
             | 
| 142 | 
            +
             | 
| 143 | 
            +
            def main(args, pipe, fantasytalking, wav2vec_processor, wav2vec):
         | 
| 144 | 
            +
                """主推理函数"""
         | 
| 145 | 
            +
                print(f"输入图像: {args.image_path}")
         | 
| 146 | 
            +
                print(f"输入音频: {args.audio_path}")
         | 
| 147 | 
            +
                print(f"提示词: {args.prompt}")
         | 
| 148 | 
            +
                
         | 
| 149 | 
            +
                # 创建输出目录
         | 
| 150 | 
            +
                os.makedirs(args.output_dir, exist_ok=True)
         | 
| 151 | 
            +
                
         | 
| 152 | 
            +
                # 在完整版本中,这里会执行实际的推理
         | 
| 153 | 
            +
                print("开始推理...")
         | 
| 154 | 
            +
                
         | 
| 155 | 
            +
                # 模拟输出路径
         | 
| 156 | 
            +
                current_time = datetime.now().strftime("%Y%m%d_%H%M%S")
         | 
| 157 | 
            +
                output_path = f"{args.output_dir}/output_{current_time}.mp4"
         | 
| 158 | 
            +
                
         | 
| 159 | 
            +
                print(f"输出将保存到: {output_path}")
         | 
| 160 | 
            +
                print("推理完成(演示模式)")
         | 
| 161 | 
            +
                
         | 
| 162 | 
            +
                return output_path
         | 
| 163 | 
            +
             | 
| 164 | 
            +
             | 
| 165 | 
            +
if __name__ == "__main__":
    # Script entry point: parse CLI args, load (stub) models, then run
    # the demo-mode inference pass.
    args = parse_args()
    pipe, fantasytalking, wav2vec_processor, wav2vec = load_models(args)
    main(args, pipe, fantasytalking, wav2vec_processor, wav2vec)
         | 
    	
        model.py
    ADDED
    
    | @@ -0,0 +1,99 @@ | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | 
|  | |
| 1 | 
            +
            # Copyright Alibaba Inc. All Rights Reserved.
         | 
| 2 | 
            +
             | 
| 3 | 
            +
            import os
         | 
| 4 | 
            +
            import torch
         | 
| 5 | 
            +
            import torch.nn as nn
         | 
| 6 | 
            +
            import torch.nn.functional as F
         | 
| 7 | 
            +
            from safetensors import safe_open
         | 
| 8 | 
            +
             | 
| 9 | 
            +
             | 
| 10 | 
            +
            class AudioProjModel(nn.Module):
         | 
| 11 | 
            +
                """音频投影模型"""
         | 
| 12 | 
            +
                def __init__(self, audio_dim, proj_dim):
         | 
| 13 | 
            +
                    super().__init__()
         | 
| 14 | 
            +
                    self.audio_dim = audio_dim
         | 
| 15 | 
            +
                    self.proj_dim = proj_dim
         | 
| 16 | 
            +
                    
         | 
| 17 | 
            +
                    self.projection = nn.Sequential(
         | 
| 18 | 
            +
                        nn.Linear(audio_dim, proj_dim * 2),
         | 
| 19 | 
            +
                        nn.ReLU(),
         | 
| 20 | 
            +
                        nn.Linear(proj_dim * 2, proj_dim),
         | 
| 21 | 
            +
                    )
         | 
| 22 | 
            +
                
         | 
| 23 | 
            +
                def forward(self, audio_features):
         | 
| 24 | 
            +
                    return self.projection(audio_features)
         | 
| 25 | 
            +
             | 
| 26 | 
            +
             | 
| 27 | 
            +
            class WanCrossAttentionProcessor(nn.Module):
         | 
| 28 | 
            +
                """Wan模型的交叉注意力处理器"""
         | 
| 29 | 
            +
                def __init__(self, hidden_size, cross_attention_dim, audio_proj_dim):
         | 
| 30 | 
            +
                    super().__init__()
         | 
| 31 | 
            +
                    self.hidden_size = hidden_size
         | 
| 32 | 
            +
                    self.cross_attention_dim = cross_attention_dim
         | 
| 33 | 
            +
                    self.audio_proj_dim = audio_proj_dim
         | 
| 34 | 
            +
                    
         | 
| 35 | 
            +
                    # 音频条件的查询、键、值投影层
         | 
| 36 | 
            +
                    self.to_q_audio = nn.Linear(hidden_size, hidden_size, bias=False)
         | 
| 37 | 
            +
                    self.to_k_audio = nn.Linear(audio_proj_dim, hidden_size, bias=False)
         | 
| 38 | 
            +
                    self.to_v_audio = nn.Linear(audio_proj_dim, hidden_size, bias=False)
         | 
| 39 | 
            +
                    
         | 
| 40 | 
            +
                    self.scale = hidden_size ** -0.5
         | 
| 41 | 
            +
                
         | 
| 42 | 
            +
                def forward(self, hidden_states, audio_features=None, **kwargs):
         | 
| 43 | 
            +
                    if audio_features is None:
         | 
| 44 | 
            +
                        return hidden_states
         | 
| 45 | 
            +
                    
         | 
| 46 | 
            +
                    batch_size, seq_len, _ = hidden_states.shape
         | 
| 47 | 
            +
                    
         | 
| 48 | 
            +
                    # 计算查询、键、值
         | 
| 49 | 
            +
                    query = self.to_q_audio(hidden_states)
         | 
| 50 | 
            +
                    key = self.to_k_audio(audio_features)
         | 
| 51 | 
            +
                    value = self.to_v_audio(audio_features)
         | 
| 52 | 
            +
                    
         | 
| 53 | 
            +
                    # 计算注意力权重
         | 
| 54 | 
            +
                    attention_scores = torch.matmul(query, key.transpose(-2, -1)) * self.scale
         | 
| 55 | 
            +
                    attention_probs = F.softmax(attention_scores, dim=-1)
         | 
| 56 | 
            +
                    
         | 
| 57 | 
            +
                    # 应用注意力权重
         | 
| 58 | 
            +
                    attention_output = torch.matmul(attention_probs, value)
         | 
| 59 | 
            +
                    
         | 
| 60 | 
            +
                    return hidden_states + attention_output
         | 
| 61 | 
            +
             | 
| 62 | 
            +
             | 
| 63 | 
            +
            class FantasyTalkingAudioConditionModel(nn.Module):
         | 
| 64 | 
            +
                """FantasyTalking音频条件模型"""
         | 
| 65 | 
            +
                def __init__(self, base_model, audio_dim, proj_dim):
         | 
| 66 | 
            +
                    super().__init__()
         | 
| 67 | 
            +
                    self.base_model = base_model
         | 
| 68 | 
            +
                    self.audio_dim = audio_dim
         | 
| 69 | 
            +
                    self.proj_dim = proj_dim
         | 
| 70 | 
            +
                    
         | 
| 71 | 
            +
                    # 音频投影层
         | 
| 72 | 
            +
                    self.audio_proj = AudioProjModel(audio_dim, proj_dim)
         | 
| 73 | 
            +
                    
         | 
| 74 | 
            +
                    # 存储原始的注意力处理器
         | 
| 75 | 
            +
                    self.original_processors = {}
         | 
| 76 | 
            +
                    
         | 
| 77 | 
            +
                def load_audio_processor(self, checkpoint_path, base_model):
         | 
| 78 | 
            +
                    """加载音频处理器权重"""
         | 
| 79 | 
            +
                    if os.path.exists(checkpoint_path):
         | 
| 80 | 
            +
                        print(f"加载FantasyTalking权重: {checkpoint_path}")
         | 
| 81 | 
            +
                        # 这里应该加载实际的权重文件
         | 
| 82 | 
            +
                        # state_dict = torch.load(checkpoint_path, map_location="cpu")
         | 
| 83 | 
            +
                        # self.load_state_dict(state_dict, strict=False)
         | 
| 84 | 
            +
                    else:
         | 
| 85 | 
            +
                        print(f"权重文件不存在: {checkpoint_path}")
         | 
| 86 | 
            +
                
         | 
| 87 | 
            +
                def enable_audio_condition(self):
         | 
| 88 | 
            +
                    """启用音频条件"""
         | 
| 89 | 
            +
                    # 这里应该替换base_model中的注意力处理器
         | 
| 90 | 
            +
                    pass
         | 
| 91 | 
            +
                
         | 
| 92 | 
            +
                def disable_audio_condition(self):
         | 
| 93 | 
            +
                    """禁用音频条件"""
         | 
| 94 | 
            +
                    # 这里应该恢复原始的注意力处理器
         | 
| 95 | 
            +
                    pass
         | 
| 96 | 
            +
                
         | 
| 97 | 
            +
                def forward(self, audio_features):
         | 
| 98 | 
            +
                    """前向传播"""
         | 
| 99 | 
            +
                    return self.audio_proj(audio_features)
         | 
    	
        requirements.txt
    ADDED
    
    | @@ -0,0 +1,15 @@ | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | 
|  | |
| 1 | 
            +
            torch>=2.0.0
         | 
| 2 | 
            +
            torchvision
         | 
| 3 | 
            +
            transformers==4.46.2
         | 
| 4 | 
            +
            gradio==5.34.2
         | 
| 5 | 
            +
            spaces
         | 
| 6 | 
            +
            imageio
         | 
| 7 | 
            +
            imageio[ffmpeg]
         | 
| 8 | 
            +
            safetensors
         | 
| 9 | 
            +
            einops
         | 
| 10 | 
            +
            sentencepiece
         | 
| 11 | 
            +
            protobuf
         | 
| 12 | 
            +
            librosa
         | 
| 13 | 
            +
            numpy
         | 
| 14 | 
            +
            pillow
         | 
| 15 | 
            +
            tqdm
         | 
    	
        utils.py
    ADDED
    
    | @@ -0,0 +1,70 @@ | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | 
|  | |
| 1 | 
            +
            # Copyright Alibaba Inc. All Rights Reserved.
         | 
| 2 | 
            +
             | 
| 3 | 
            +
            import imageio
         | 
| 4 | 
            +
            import librosa
         | 
| 5 | 
            +
            import numpy as np
         | 
| 6 | 
            +
            import torch
         | 
| 7 | 
            +
            from PIL import Image
         | 
| 8 | 
            +
            from tqdm import tqdm
         | 
| 9 | 
            +
             | 
| 10 | 
            +
             | 
| 11 | 
            +
            def resize_image_by_longest_edge(image_path, target_size):
         | 
| 12 | 
            +
                """根据最长边调整图像大小"""
         | 
| 13 | 
            +
                image = Image.open(image_path)
         | 
| 14 | 
            +
                width, height = image.size
         | 
| 15 | 
            +
                
         | 
| 16 | 
            +
                if max(width, height) <= target_size:
         | 
| 17 | 
            +
                    return image
         | 
| 18 | 
            +
                
         | 
| 19 | 
            +
                if width > height:
         | 
| 20 | 
            +
                    new_width = target_size
         | 
| 21 | 
            +
                    new_height = int(height * target_size / width)
         | 
| 22 | 
            +
                else:
         | 
| 23 | 
            +
                    new_height = target_size
         | 
| 24 | 
            +
                    new_width = int(width * target_size / height)
         | 
| 25 | 
            +
                
         | 
| 26 | 
            +
                return image.resize((new_width, new_height), Image.Resampling.LANCZOS)
         | 
| 27 | 
            +
             | 
| 28 | 
            +
             | 
| 29 | 
            +
            def save_video(frames, save_path, fps, quality=9, ffmpeg_params=None):
         | 
| 30 | 
            +
                """保存视频帧为MP4文件"""
         | 
| 31 | 
            +
                if isinstance(frames, torch.Tensor):
         | 
| 32 | 
            +
                    frames = frames.cpu().numpy()
         | 
| 33 | 
            +
                
         | 
| 34 | 
            +
                # 确保帧数据在正确的范围内
         | 
| 35 | 
            +
                if frames.max() <= 1.0:
         | 
| 36 | 
            +
                    frames = (frames * 255).astype(np.uint8)
         | 
| 37 | 
            +
                else:
         | 
| 38 | 
            +
                    frames = frames.astype(np.uint8)
         | 
| 39 | 
            +
                
         | 
| 40 | 
            +
                # 使用imageio保存视频
         | 
| 41 | 
            +
                writer = imageio.get_writer(save_path, fps=fps, quality=quality)
         | 
| 42 | 
            +
                for frame in tqdm(frames, desc="保存视频"):
         | 
| 43 | 
            +
                    writer.append_data(frame)
         | 
| 44 | 
            +
                writer.close()
         | 
| 45 | 
            +
             | 
| 46 | 
            +
             | 
| 47 | 
            +
            def get_audio_features(wav2vec, audio_processor, audio_path, fps, num_frames):
         | 
| 48 | 
            +
                """提取音频特征"""
         | 
| 49 | 
            +
                sr = 16000
         | 
| 50 | 
            +
                audio_input, sample_rate = librosa.load(audio_path, sr=sr)  # 采样率为 16kHz
         | 
| 51 | 
            +
             | 
| 52 | 
            +
                start_time = 0
         | 
| 53 | 
            +
                end_time = num_frames / fps
         | 
| 54 | 
            +
             | 
| 55 | 
            +
                start_sample = int(start_time * sr)
         | 
| 56 | 
            +
                end_sample = int(end_time * sr)
         | 
| 57 | 
            +
             | 
| 58 | 
            +
                try:
         | 
| 59 | 
            +
                    audio_segment = audio_input[start_sample:end_sample]
         | 
| 60 | 
            +
                except:
         | 
| 61 | 
            +
                    audio_segment = audio_input
         | 
| 62 | 
            +
             | 
| 63 | 
            +
                input_values = audio_processor(
         | 
| 64 | 
            +
                    audio_segment, sampling_rate=sample_rate, return_tensors="pt"
         | 
| 65 | 
            +
                ).input_values.to("cuda")
         | 
| 66 | 
            +
             | 
| 67 | 
            +
                with torch.no_grad():
         | 
| 68 | 
            +
                    fea = wav2vec(input_values).last_hidden_state
         | 
| 69 | 
            +
             | 
| 70 | 
            +
                return fea
         | 
