Update app.py
app.py CHANGED
@@ -33,7 +33,6 @@ if hf_token:
 else:
     print("Warning: HF_TOKEN not found in environment variables. You may encounter authentication issues.")
 
-
 def download_model():
     REPO_ID = 'Doubiiu/DynamiCrafter_1024'
     filename_list = ['model.ckpt']
@@ -45,11 +44,11 @@ def download_model():
         hf_hub_download(repo_id=REPO_ID, filename=filename, local_dir='./checkpoints/dynamicrafter_1024_v1/', force_download=True)
 
 download_model()
-ckpt_path='checkpoints/dynamicrafter_1024_v1/model.ckpt'
-config_file='configs/inference_1024_v1.0.yaml'
+ckpt_path = 'checkpoints/dynamicrafter_1024_v1/model.ckpt'
+config_file = 'configs/inference_1024_v1.0.yaml'
 config = OmegaConf.load(config_file)
 model_config = config.pop("model", OmegaConf.create())
-model_config['params']['unet_config']['params']['use_checkpoint']=False
+model_config['params']['unet_config']['params']['use_checkpoint'] = False
 model = instantiate_from_config(model_config)
 assert os.path.exists(ckpt_path), "Error: checkpoint Not Found!"
 model = load_model_checkpoint(model, ckpt_path)
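For orientation, the hunk above shows only the `hf_hub_download` call inside `download_model()`; the enclosing loop sits outside the diff. A minimal sketch of what that helper presumably looks like, with the loop structure treated as an assumption:

    # Hypothetical reconstruction of download_model(); only the hf_hub_download call
    # is visible in the diff, so the loop and its exact structure are assumptions.
    from huggingface_hub import hf_hub_download

    def download_model():
        REPO_ID = 'Doubiiu/DynamiCrafter_1024'
        filename_list = ['model.ckpt']
        for filename in filename_list:
            hf_hub_download(repo_id=REPO_ID, filename=filename,
                            local_dir='./checkpoints/dynamicrafter_1024_v1/',
                            force_download=True)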
@@ -67,11 +66,18 @@ flux_pipe = FluxPipeline.from_pretrained(
 )
 flux_pipe.enable_model_cpu_offload()
 
+def translate_prompt(prompt):
+    # Detect Korean input and translate it to English
+    if any('\u3131' <= char <= '\u318E' or '\uAC00' <= char <= '\uD7A3' for char in prompt):
+        translated = translator(prompt, max_length=512)[0]['translation_text']
+        return translated
+    return prompt
 
 def generate_image_from_text(prompt, seed=0):
+    translated_prompt = translate_prompt(prompt)
     generator = torch.Generator("cpu").manual_seed(seed)
     image = flux_pipe(
-
+        translated_prompt,
         height=576,
         width=1024,
         guidance_scale=3.5,
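The new `translate_prompt()` helper gates translation on a plain Unicode range check: U+3131 to U+318E covers Hangul Compatibility Jamo and U+AC00 to U+D7A3 covers precomposed Hangul syllables. A self-contained sketch of just that check (the `translator` pipeline used in the app is not recreated here):

    # Stand-alone version of the Hangul detection used by translate_prompt().
    def contains_korean(text: str) -> bool:
        # Hangul Compatibility Jamo (U+3131..U+318E) or Hangul Syllables (U+AC00..U+D7A3)
        return any('\u3131' <= ch <= '\u318E' or '\uAC00' <= ch <= '\uD7A3' for ch in text)

    print(contains_korean('우주비행사가 기타를 치는 남자'))   # True
    print(contains_korean('an astronaut playing a guitar'))  # False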
@@ -83,158 +89,96 @@ def generate_image_from_text(prompt, seed=0):
 
 @spaces.GPU(duration=600)
 def infer(image, prompt, steps=50, cfg_scale=7.5, eta=1.0, fs=3, seed=123, video_length=2):
-
-
-    translated = translator(prompt, max_length=512)[0]['translation_text']
-    prompt = translated
-    print(f"Translated prompt: {prompt}")
-
+    translated_prompt = translate_prompt(prompt)
+    print(f"Translated prompt: {translated_prompt}")
     resolution = (576, 1024)
     save_fps = 8
     seed_everything(seed)
     transform = transforms.Compose([
-        transforms.Resize(min(resolution)),
+        transforms.Resize(min(resolution), antialias=True),
         transforms.CenterCrop(resolution),
-
+    ])
     torch.cuda.empty_cache()
-    print('Start:',
+    print('Start:', translated_prompt, time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time())))
     start = time.time()
     if steps > 60:
         steps = 60
-
     batch_size = 1
     channels = model.model.diffusion_model.out_channels
-    frames = int(video_length * save_fps)
+    frames = int(video_length * save_fps)
     h, w = resolution[0] // 8, resolution[1] // 8
     noise_shape = [batch_size, channels, frames, h, w]
-
-    # Set up the text conditioning
     with torch.no_grad(), torch.cuda.amp.autocast():
-        text_emb = model.get_learned_conditioning([
+        text_emb = model.get_learned_conditioning([translated_prompt])
         img_tensor = torch.from_numpy(image).permute(2, 0, 1).float().to(model.device)
         img_tensor = (img_tensor / 255. - 0.5) * 2
         image_tensor_resized = transform(img_tensor).unsqueeze(0) # bchw
-
         z = get_latent_z(model, image_tensor_resized.unsqueeze(2)) #bc,1,hw
         img_tensor_repeat = repeat(z, 'b c t h w -> b c (repeat t) h w', repeat=frames)
         cond_images = model.embedder(img_tensor.unsqueeze(0)) # blc
         img_emb = model.image_proj_model(cond_images)
         imtext_cond = torch.cat([text_emb, img_emb], dim=1)
-
         fs = torch.tensor([fs], dtype=torch.long, device=model.device)
         cond = {"c_crossattn": [imtext_cond], "fs": fs, "c_concat": [img_tensor_repeat]}
-
-        # Run inference
         batch_samples = batch_ddim_sampling(model, cond, noise_shape, n_samples=1, ddim_steps=steps, ddim_eta=eta, cfg_scale=cfg_scale)
-
     video_path = './output.mp4'
     save_videos(batch_samples, './', filenames=['output'], fps=save_fps)
     return video_path
 
-
-@spaces.GPU(duration=300)
-def infer_t2v(prompt, video_prompt, steps=50, cfg_scale=7.5, eta=1.0, fs=3, seed=123, video_length=2):
-    # Generate the image
-    image = generate_image_from_text(prompt, seed)
-
-    # Convert the image to a numpy array
-    image_np = np.array(image)
-
-    # Call the existing infer function for video generation
-    return infer(image_np, video_prompt, steps, cfg_scale, eta, fs, seed, video_length)
-
-i2v_examples = [
-    ['prompts/1024/astronaut04.png', 'a man in an astronaut suit playing a guitar', 30, 7.5, 1.0, 6, 123, 2],
-]
-
 css = """#input_img {max-width: 1024px !important} #output_vid {max-width: 1024px; max-height: 576px}"""
 
-def generate_only_image(prompt, seed=123):
-    # Generate the image
-    image = generate_image_from_text(prompt, seed)
-
-    # Convert to a PIL image and return it
-    return Image.fromarray(np.array(image))
-
 with gr.Blocks(analytics_enabled=False, css=css) as dynamicrafter_iface:
     gr.Markdown("kAI 무비 스튜디오")
-
-
-    with gr.Tab(label='Image(+Text) Generation'):
+    with gr.Tab(label='Image Generation'):
         with gr.Column():
             with gr.Row():
-
-
-
-
-
-                img_output_image = gr.Image(label="Generated Image")
-
+                img_input_text = gr.Text(label='Image Generation Prompt')
+                img_seed = gr.Slider(label='Random Seed', minimum=0, maximum=10000, step=1, value=123)
+                img_generate_btn = gr.Button("Generate Image")
+            with gr.Row():
+                img_output_image = gr.Image(label="Generated Image")
         img_generate_btn.click(
             inputs=[img_input_text, img_seed],
             outputs=[img_output_image],
-            fn=
-
-
-
+            fn=generate_image_from_text
+        )
     with gr.Tab(label='Image to Video Generation'):
         with gr.Column():
             with gr.Row():
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-            with gr.Row():
-                i2v_output_video = gr.Video(label="Generated Video",elem_id="output_vid",autoplay=True,show_share_button=True)
-
-        gr.Examples(examples=i2v_examples,
-            inputs=[i2v_input_image, i2v_input_text, i2v_steps, i2v_cfg_scale, i2v_eta, i2v_motion, i2v_seed, i2v_video_length],
-            outputs=[i2v_output_video],
-            fn = infer,
-            cache_examples=True,
+                video_input_image = gr.Image(label="Input Image for Video", tool="input")
+                video_prompt = gr.Text(label='Video Generation Prompt')
+                video_seed = gr.Slider(label='Random Seed', minimum=0, maximum=10000, step=1, value=123)
+                video_steps = gr.Slider(label="Sampling steps", minimum=1, maximum=50, step=1, value=30)
+                video_cfg_scale = gr.Slider(label='CFG Scale', minimum=1.0, maximum=15.0, step=0.5, value=7.5)
+                video_eta = gr.Slider(label='ETA', minimum=0.0, maximum=1.0, step=0.1, value=1.0)
+                video_fs = gr.Slider(label='FS', minimum=1, maximum=10, step=1, value=3)
+                video_length = gr.Slider(label="Video Length (seconds)", minimum=2, maximum=8, step=1, value=2)
+                video_generate_btn = gr.Button("Generate Video")
+            with gr.Row():
+                video_output = gr.Video(label="Generated Video", autoplay=True, show_share_button=True)
+        video_generate_btn.click(
+            inputs=[video_input_image, video_prompt, video_seed, video_steps, video_cfg_scale, video_eta, video_fs, video_length],
+            outputs=[video_output],
+            fn=infer
         )
-
-            outputs=[i2v_output_video],
-            fn = infer
-        )
-
+
     with gr.Tab(label='Text to Video Generation'):
         with gr.Column():
             with gr.Row():
-
-
-
-
-
-
-
-
-
-
-
-
-
-                t2v_end_btn = gr.Button("Generate")
-            with gr.Row():
-                t2v_output_video = gr.Video(label="Generated Video", autoplay=True, show_share_button=True)
-
-        t2v_end_btn.click(
-            inputs=[t2v_input_text, t2v_video_prompt, t2v_steps, t2v_cfg_scale, t2v_eta, t2v_motion, t2v_seed, t2v_video_length],
-            outputs=[t2v_output_video],
+                video_prompt = gr.Text(label='Video Generation Prompt')
+                video_seed = gr.Slider(label='Random Seed', minimum=0, maximum=10000, step=1, value=123)
+                video_steps = gr.Slider(label="Sampling steps", minimum=1, maximum=50, step=1, value=30)
+                video_cfg_scale = gr.Slider(label='CFG Scale', minimum=1.0, maximum=15.0, step=0.5, value=7.5)
+                video_eta = gr.Slider(label='ETA', minimum=0.0, maximum=1.0, step=0.1, value=1.0)
+                video_fs = gr.Slider(label='FS', minimum=1, maximum=10, step=1, value=3)
+                video_length = gr.Slider(label="Video Length (seconds)", minimum=2, maximum=8, step=1, value=2)
+                video_generate_btn = gr.Button("Generate Video")
+            with gr.Row():
+                video_output = gr.Video(label="Generated Video", autoplay=True, show_share_button=True)
+        video_generate_btn.click(
+            inputs=[video_prompt, video_seed, video_steps, video_cfg_scale, video_eta, video_fs, video_length],
+            outputs=[video_output],
             fn=infer_t2v
-
-
-
+        )
 
-dynamicrafter_iface.
+dynamicrafter_iface.launch(show_api=True)
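As a worked example of the latent geometry that `infer()` sets up above, using the defaults visible in the diff (576x1024 output, 8 fps, a 2-second clip) and assuming 4 latent channels (the real code reads `model.model.diffusion_model.out_channels`):

    # Worked example of the noise_shape computed in infer(); the channel count is an
    # assumption, the other values follow the diff.
    resolution = (576, 1024)
    save_fps = 8
    video_length = 2
    batch_size = 1
    channels = 4                                    # assumed latent channel count
    frames = int(video_length * save_fps)           # 16 latent frames
    h, w = resolution[0] // 8, resolution[1] // 8   # 72 x 128 latent grid
    noise_shape = [batch_size, channels, frames, h, w]
    print(noise_shape)                              # [1, 4, 16, 72, 128]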
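The tab wiring in the last hunk follows the usual Gradio pattern: declare components inside a `gr.Blocks` context, then bind a button's `.click()` to a function whose parameters and return values line up with `inputs` and `outputs`. A minimal, self-contained sketch of that pattern; the `echo` function is a stand-in, not part of the app:

    # Minimal Blocks + Button.click wiring, mirroring the structure used above.
    import gradio as gr

    def echo(prompt, seed):
        return f"prompt={prompt}, seed={int(seed)}"

    with gr.Blocks() as demo:
        with gr.Row():
            prompt_box = gr.Text(label='Prompt')
            seed_slider = gr.Slider(label='Random Seed', minimum=0, maximum=10000, step=1, value=123)
        result_box = gr.Text(label='Result')
        run_btn = gr.Button("Run")
        run_btn.click(fn=echo, inputs=[prompt_box, seed_slider], outputs=[result_box])

    if __name__ == "__main__":
        demo.launch()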