Spaces:
Runtime error
Runtime error
File size: 12,968 Bytes
cf2f35c a2a9a31 cf2f35c a2a9a31 cf2f35c a2a9a31 cf2f35c a2a9a31 cf2f35c a2a9a31 c5c8aa3 a2a9a31 cf2f35c c5c8aa3 a2a9a31 c5c8aa3 a2a9a31 cf2f35c a2a9a31 cf2f35c a2a9a31 cf2f35c a2a9a31 cf2f35c a2a9a31 cf2f35c a2a9a31 c5c8aa3 a2a9a31 cf2f35c c5c8aa3 a2a9a31 c5c8aa3 a2a9a31 cf2f35c a2a9a31 c5c8aa3 cf2f35c c5c8aa3 a2a9a31 cf2f35c a2a9a31 cf2f35c a2a9a31 cf2f35c a2a9a31 cf2f35c a2a9a31 cf2f35c a2a9a31 c5c8aa3 cf2f35c a2a9a31 cf2f35c a2a9a31 cf2f35c c5c8aa3 cf2f35c a2a9a31 cf2f35c a2a9a31 cf2f35c a2a9a31 cf2f35c a2a9a31 cf2f35c a2a9a31 cf2f35c a2a9a31 cf2f35c a2a9a31 c5c8aa3 a2a9a31 c5c8aa3 a2a9a31 c5c8aa3 cf2f35c a2a9a31 cf2f35c a2a9a31 cf2f35c c5c8aa3 a2a9a31 c5c8aa3 a2a9a31 c5c8aa3 a2a9a31 c5c8aa3 a2a9a31 c5c8aa3 a2a9a31 cf2f35c a2a9a31 c5c8aa3 cf2f35c a2a9a31 cf2f35c a2a9a31 c5c8aa3 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 |
import torch
import psutil
import argparse
import os
from diffusers import FlowMatchEulerDiscreteScheduler
from diffusers.utils import load_image
from transformers import AutoTokenizer, Wav2Vec2Model, Wav2Vec2Processor
from omegaconf import OmegaConf
from wan.models.cache_utils import get_teacache_coefficients
from wan.models.wan_fantasy_transformer3d_1B import WanTransformer3DFantasyModel
from wan.models.wan_text_encoder import WanT5EncoderModel
from wan.models.wan_vae import AutoencoderKLWan
from wan.models.wan_image_encoder import CLIPModel
from wan.pipeline.wan_inference_long_pipeline import WanI2VTalkingInferenceLongPipeline
from wan.utils.fp8_optimization import replace_parameters_by_name, convert_weight_dtype_wrapper, convert_model_weight_to_float8
from wan.utils.utils import get_image_to_video_latent, save_videos_grid
import numpy as np
import librosa
import datetime
import random
import math
import subprocess
from huggingface_hub import snapshot_download
import requests
import shutil
# --- Global settings ---
# Run on CUDA when it is available, otherwise on the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

# Working precision: float32 on CPU; on CUDA, bfloat16 when the device's
# major compute capability is at least 8, float16 otherwise.
if device == "cpu":
    dtype = torch.float32
elif torch.cuda.get_device_capability()[0] >= 8:
    dtype = torch.bfloat16
else:
    dtype = torch.float16
def filter_kwargs(cls, kwargs):
    """Drop keyword arguments that ``cls.__init__`` does not declare.

    Args:
        cls: Class whose ``__init__`` signature is inspected.
        kwargs: Mapping of candidate keyword arguments.

    Returns:
        A new dict holding only the entries of ``kwargs`` whose key is a
        parameter name of ``cls.__init__``; the names ``self`` and ``cls``
        are never kept. Insertion order of ``kwargs`` is preserved.
    """
    import inspect

    init_params = inspect.signature(cls.__init__).parameters
    accepted = {}
    for key, value in kwargs.items():
        # 'self' / 'cls' are binding slots, not real constructor options.
        if key in ('self', 'cls'):
            continue
        if key in init_params:
            accepted[key] = value
    return accepted
def download_file(url, local_path, timeout=30):
    """Fetch ``url`` to ``local_path``, or pass an existing local path through.

    Args:
        url: Either an ``http://`` / ``https://`` URL or a local filesystem
            path.
        local_path: Destination used only when ``url`` is a remote URL.
        timeout: Value forwarded to ``requests.get(timeout=...)`` so a
            stalled server cannot block the script forever. Per the Requests
            documentation this bounds the connect phase and each read, not
            the total transfer time.

    Returns:
        ``local_path`` after a successful download, ``url`` itself when it
        is an existing local path, or ``None`` when the download fails or
        the path does not exist.
    """
    if url.startswith(('http://', 'https://')):
        print(f"從 {url} 下載檔案中...")
        # Stream into a sibling ".part" file so that a failed or interrupted
        # transfer never leaves a truncated file at ``local_path``.
        partial_path = f"{local_path}.part"
        try:
            with requests.get(url, stream=True, timeout=timeout) as r:
                r.raise_for_status()
                with open(partial_path, 'wb') as f:
                    for chunk in r.iter_content(chunk_size=8192):
                        f.write(chunk)
            os.replace(partial_path, local_path)
            print(f"檔案已儲存至 {local_path}")
            return local_path
        except requests.exceptions.RequestException as e:
            print(f"錯誤:無法下載檔案 {url}。 {e}")
            return None
        finally:
            # After a successful os.replace the ".part" file is gone, so this
            # only removes leftovers from a failed transfer.
            if os.path.exists(partial_path):
                os.remove(partial_path)
    elif os.path.exists(url):
        print(f"使用本地檔案: {url}")
        return url
    else:
        print(f"錯誤:檔案或 URL 不存在: {url}")
        return None
def setup_models(repo_root, model_version):
    """Load every model component and assemble the inference pipeline.

    Args:
        repo_root: Directory that contains the ``Wan2.1-Fun-V1.1-1.3B-InP``,
            ``wav2vec2-base-960h``, ``StableAvatar-1.3B`` and
            ``deepspeed_config`` folders.
        model_version: ``"square"`` selects ``transformer3d-square.pt``; any
            other value selects ``transformer3d-rec-vec.pt``.

    Returns:
        A ``(pipeline, transformer3d, vae)`` tuple.

    Raises:
        FileNotFoundError: If the YAML config or the StableAvatar checkpoint
            is missing under ``repo_root``.
    """
    base_model_dir = os.path.join(repo_root, "Wan2.1-Fun-V1.1-1.3B-InP")
    wav2vec_dir = os.path.join(repo_root, "wav2vec2-base-960h")
    config_path = os.path.join(repo_root, "deepspeed_config/wan2.1/wan_civitai.yaml")
    if not os.path.exists(config_path):
        raise FileNotFoundError(f"設定檔未找到: {config_path}")
    config = OmegaConf.load(config_path)

    def component_dir(section, key, fallback):
        # Sub-folder of the base model for one component; the config may
        # override the default folder name through ``key``.
        return os.path.join(base_model_dir, config[section].get(key, fallback))

    print("正在載入 Tokenizer...")
    tokenizer = AutoTokenizer.from_pretrained(
        component_dir('text_encoder_kwargs', 'tokenizer_subpath', 'tokenizer')
    )

    print("正在載入 Text Encoder...")
    text_encoder = WanT5EncoderModel.from_pretrained(
        component_dir('text_encoder_kwargs', 'text_encoder_subpath', 'text_encoder'),
        additional_kwargs=OmegaConf.to_container(config['text_encoder_kwargs']),
        low_cpu_mem_usage=True,
        torch_dtype=dtype,
    )
    text_encoder = text_encoder.eval()

    print("正在載入 VAE...")
    vae = AutoencoderKLWan.from_pretrained(
        component_dir('vae_kwargs', 'vae_subpath', 'vae'),
        additional_kwargs=OmegaConf.to_container(config['vae_kwargs']),
    )

    print("正在載入 Wav2Vec...")
    wav2vec_processor = Wav2Vec2Processor.from_pretrained(wav2vec_dir)
    wav2vec = Wav2Vec2Model.from_pretrained(wav2vec_dir).to("cpu")

    print("正在載入 CLIP Image Encoder...")
    clip_image_encoder = CLIPModel.from_pretrained(
        component_dir('image_encoder_kwargs', 'image_encoder_subpath', 'image_encoder')
    )
    clip_image_encoder = clip_image_encoder.eval()

    print("正在載入 Transformer 3D 基礎模型...")
    transformer3d = WanTransformer3DFantasyModel.from_pretrained(
        component_dir('transformer_additional_kwargs', 'transformer_subpath', 'transformer'),
        transformer_additional_kwargs=OmegaConf.to_container(config['transformer_additional_kwargs']),
        low_cpu_mem_usage=False,
        torch_dtype=dtype,
    )

    # Overlay the StableAvatar-specific weights on top of the base transformer.
    if model_version == "square":
        weights_name = "transformer3d-square.pt"
    else:  # rec_vec
        weights_name = "transformer3d-rec-vec.pt"
    transformer_path = os.path.join(repo_root, "StableAvatar-1.3B", weights_name)
    if not os.path.exists(transformer_path):
        raise FileNotFoundError(f"找不到 StableAvatar 權重檔案:{transformer_path}。請確保模型已完整下載。")
    print(f"正在從 {transformer_path} 載入 StableAvatar 權重...")
    # NOTE(review): torch.load unpickles the checkpoint file; only point this
    # at checkpoints from a trusted source.
    checkpoint = torch.load(transformer_path, map_location="cpu")
    # Some checkpoints wrap the weights under a "state_dict" key.
    if "state_dict" in checkpoint:
        checkpoint = checkpoint["state_dict"]
    # strict=False: mismatched keys are only counted and reported below.
    missing, unexpected = transformer3d.load_state_dict(checkpoint, strict=False)
    print(f"StableAvatar 權重載入成功。 Missing keys: {len(missing)}; Unexpected keys: {len(unexpected)}")

    sampler_name = "Flow"
    available_samplers = {"Flow": FlowMatchEulerDiscreteScheduler}
    scheduler_class = available_samplers[sampler_name]
    scheduler_options = OmegaConf.to_container(config['scheduler_kwargs'])
    # Pass only the options the scheduler constructor actually declares.
    scheduler = scheduler_class(**filter_kwargs(scheduler_class, scheduler_options))

    print("正在建立 Pipeline...")
    pipeline = WanI2VTalkingInferenceLongPipeline(
        tokenizer=tokenizer,
        text_encoder=text_encoder,
        vae=vae,
        transformer=transformer3d,
        clip_image_encoder=clip_image_encoder,
        scheduler=scheduler,
        wav2vec_processor=wav2vec_processor,
        wav2vec=wav2vec,
    )
    return pipeline, transformer3d, vae
def run_inference(
    pipeline, transformer3d, vae, image_path, audio_path, prompt,
    negative_prompt, seed, output_filename, gpu_memory_mode="model_cpu_offload",
    width=512, height=512, num_inference_steps=50, fps=25, **kwargs
):
    """Generate a talking video from one image and one audio clip.

    Args:
        pipeline: Pipeline object returned by ``setup_models``.
        transformer3d: Accepted for interface compatibility; not used here.
        vae: Accepted for interface compatibility; not used here.
        image_path: Path of the reference image.
        audio_path: Path of the driving audio; also muxed into the result.
        prompt: Positive text prompt.
        negative_prompt: Negative text prompt.
        seed: Random seed; a negative value draws a random non-negative seed.
        output_filename: Base name (no extension) of the file written under
            ``outputs/``.
        gpu_memory_mode: ``"sequential_cpu_offload"`` or
            ``"model_cpu_offload"`` enable the matching offload; any other
            value moves the whole pipeline to ``device``.
        width: Output width passed to the pipeline.
        height: Output height passed to the pipeline.
        num_inference_steps: Number of sampling steps.
        fps: Frame rate used by the pipeline and when saving the video.
        **kwargs: Ignored.

    Returns:
        ``(path_of_video_with_audio, seed_used)``.

    Raises:
        subprocess.CalledProcessError: If ffmpeg exits with a non-zero status.
    """
    if seed < 0:
        seed = random.randint(0, np.iinfo(np.int32).max)
    print(f"使用的種子: {seed}")

    # Each memory mode maps to one pipeline method; all of them take the
    # target device as a keyword. Unknown modes fall back to ``pipeline.to``.
    placement_method = {
        "sequential_cpu_offload": "enable_sequential_cpu_offload",
        "model_cpu_offload": "enable_model_cpu_offload",
    }.get(gpu_memory_mode, "to")
    getattr(pipeline, placement_method)(device=device)

    video_length = 81
    sr = 16000
    with torch.no_grad():
        print("正在準備輸入資料...")
        # Original author's note: get_image_to_video_latent keeps its own
        # reference to vae.config, so the warning emitted here can be ignored.
        input_video, input_video_mask, clip_image = get_image_to_video_latent(
            image_path, None, video_length=video_length, sample_size=[height, width]
        )
        vocal_input, _ = librosa.load(audio_path, sr=sr)

        print("Pipeline 執行中... 這可能需要一些時間。")
        pipeline_options = dict(
            num_frames=video_length,
            negative_prompt=negative_prompt,
            width=width,
            height=height,
            guidance_scale=6.0,
            generator=torch.Generator().manual_seed(seed),
            num_inference_steps=num_inference_steps,
            video=input_video,
            mask_video=input_video_mask,
            clip_image=clip_image,
            text_guide_scale=3.0,
            audio_guide_scale=5.0,
            vocal_input_values=vocal_input,
            motion_frame=25,
            fps=fps,
            sr=sr,
            cond_file_path=image_path,
            overlap_window_length=10,
            seed=seed,
            overlapping_weight_scheme="uniform",
        )
        result = pipeline(prompt, **pipeline_options)
        sample = result.videos

        print("正在儲存影片...")
        os.makedirs("outputs", exist_ok=True)
        video_path = os.path.join("outputs", f"{output_filename}.mp4")
        save_videos_grid(sample, video_path, fps=fps)

    # Mux the driving audio into the silent video, then drop the silent copy.
    output_video_with_audio = os.path.join("outputs", f"{output_filename}_audio.mp4")
    print("正在將音訊合併到影片中...")
    ffmpeg_command = [
        "ffmpeg", "-y", "-loglevel", "quiet",
        "-i", video_path,
        "-i", audio_path,
        "-c:v", "copy",
        "-c:a", "aac",
        "-strict", "experimental",
        output_video_with_audio,
    ]
    subprocess.run(ffmpeg_command, check=True)
    os.remove(video_path)

    print(f"✅ 生成完成!影片已儲存至: {output_video_with_audio}")
    return output_video_with_audio, seed
def _resolve_input(source, repo_root, temp_dir):
    """Turn a CLI input (URL, absolute path or repo-relative path) into a local file path, or None."""
    from urllib.parse import urlsplit

    if source.startswith(('http://', 'https://')):
        # Name the local copy after the URL path only, so a query string such
        # as "?download=true" does not end up in the file name.
        file_name = os.path.basename(urlsplit(source).path) or "download"
    else:
        # Anything that is not a URL and not absolute is looked up inside the
        # downloaded repository snapshot.
        if not os.path.isabs(source):
            source = os.path.join(repo_root, source)
        file_name = os.path.basename(source)
    return download_file(source, os.path.join(temp_dir, file_name))


def main():
    """Command-line entry point: download models, resolve inputs, run inference.

    Steps: parse arguments, fetch the model snapshot from the Hugging Face
    Hub, resolve the image/audio inputs (downloading URLs into a temporary
    directory), load the models, run inference, and delete the temporary
    directory. The temporary directory is removed even when a later step
    raises. Returns ``None``; returns early when an input cannot be resolved.
    """
    parser = argparse.ArgumentParser(description="StableAvatar 命令列推理工具")
    parser.add_argument('--prompt', type=str, default="a beautiful woman is talking, masterpiece, best quality", help='正面提示詞')
    parser.add_argument('--input_image', type=str, default="example_case/case-6/reference.png", help='輸入圖片的路徑或 URL')
    parser.add_argument('--input_audio', type=str, default="example_case/case-6/audio.wav", help='輸入音訊的路徑或 URL')
    parser.add_argument('--seed', type=int, default=42, help='隨機種子,-1 表示隨機')
    parser.add_argument('--negative_prompt', type=str, default="vivid color, static, blur details, text, style, painting, picture, still, gray, worst quality, low quality, jpeg artifacts, ugly, duplicate, morbid, mutilated, out of frame, extra fingers, mutated hands, poorly drawn hands, poorly drawn face, malformed, deformed, bad anatomy, fused fingers, still image, messy background, many people in the background, walking backwards", help='負面提示詞')
    parser.add_argument('--width', type=int, default=512, help='影片寬度')
    parser.add_argument('--height', type=int, default=512, help='影片高度')
    parser.add_argument('--num_inference_steps', type=int, default=50, help='推理步數')
    parser.add_argument('--fps', type=int, default=25, help='影片幀率')
    # "sequential_cpu_offload" is listed because run_inference handles it.
    parser.add_argument('--gpu_memory_mode', type=str, default="model_cpu_offload", choices=["Normal", "model_cpu_offload", "sequential_cpu_offload"], help='GPU 記憶體優化模式')
    parser.add_argument('--model_version', type=str, default="square", choices=["square", "rec_vec"], help='StableAvatar 模型版本')
    args = parser.parse_args()

    print("--- 步驟 1: 正在檢查並下載模型 ---")
    repo_root = snapshot_download(
        repo_id="FrancisRing/StableAvatar",
        allow_patterns=["StableAvatar-1.3B/*", "Wan2.1-Fun-V1.1-1.3B-InP/*", "wav2vec2-base-960h/*", "example_case/**", "deepspeed_config/**"],
    )
    print("模型檔案已準備就緒。")

    print("\n--- 步驟 2: 正在處理輸入檔案 ---")
    timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
    temp_dir = f"temp_{timestamp}"
    os.makedirs(temp_dir, exist_ok=True)

    try:
        final_image_path = _resolve_input(args.input_image, repo_root, temp_dir)
        if not final_image_path:
            return
        final_audio_path = _resolve_input(args.input_audio, repo_root, temp_dir)
        if not final_audio_path:
            return

        print("\n--- 步驟 3: 正在載入模型 ---")
        pipeline, transformer3d, vae = setup_models(repo_root, args.model_version)
        print("模型載入完成。")

        print("\n--- 步驟 4: 開始執行推理 ---")
        run_inference(
            pipeline=pipeline, transformer3d=transformer3d, vae=vae,
            image_path=final_image_path, audio_path=final_audio_path,
            prompt=args.prompt, negative_prompt=args.negative_prompt,
            seed=args.seed, output_filename=f"output_{timestamp}",
            gpu_memory_mode=args.gpu_memory_mode, width=args.width,
            height=args.height, num_inference_steps=args.num_inference_steps,
            fps=args.fps
        )
        print("\n--- 步驟 5: 清理暫存檔案 ---")
    finally:
        # Runs on success, on early return and on exceptions, so the
        # temporary directory is never left behind.
        try:
            shutil.rmtree(temp_dir)
            print("暫存檔案已刪除。")
        except OSError as e:
            print(f"錯誤:無法刪除暫存目錄 {temp_dir}: {e}")


if __name__ == "__main__":
    main()
|