| import os | |
| import shutil | |
| from huggingface_hub import snapshot_download | |
| import gradio as gr | |
| os.chdir(os.path.dirname(os.path.abspath(__file__))) | |
| from scripts.inference import inference_process | |
| import argparse | |
| import uuid | |
# SPACE_ID is read from the environment and is later used to build the
# "https://huggingface.co/spaces/<SPACE_ID>" duplicate link. It is looked up
# with a default so the script can also start where the variable is not set
# (a plain os.environ[...] lookup raises KeyError in that case).
#
# The shared "fudan-generative-ai/hallo" Space refuses to run inference, so the
# pretrained weights are downloaded only when running anywhere else.
is_shared_ui = "fudan-generative-ai/hallo" in os.environ.get("SPACE_ID", "")
if not is_shared_ui:
    hallo_dir = snapshot_download(
        repo_id="fudan-generative-ai/hallo",
        local_dir="pretrained_models",
    )
def run_inference(source_image, driving_audio, progress=gr.Progress(track_tqdm=True)):
    """Generate a talking-head video from a portrait image and an audio clip.

    Args:
        source_image: Path of the portrait image (the UI passes a filepath).
        driving_audio: Path of the driving audio file (the UI passes a filepath).
        progress: Gradio progress tracker. It is not used directly in the
            body; it is declared in the signature with ``track_tqdm=True``.

    Returns:
        The relative path ``output-<uuid>.mp4`` that was handed to
        ``inference_process`` as its output location.

    Raises:
        gr.Error: If this is the shared Space, or if either input is missing.
    """
    if is_shared_ui:
        raise gr.Error("This Space only works in duplicated instances")

    # Fail early with a readable message instead of letting a missing input
    # surface as an error from inside the inference pipeline.
    if not source_image:
        raise gr.Error("Please provide a source image")
    if not driving_audio:
        raise gr.Error("Please provide a driving audio file")

    # A unique name per request keeps separate requests from overwriting
    # each other's result; compute it once and reuse it for the return value.
    output_path = f'output-{uuid.uuid4()}.mp4'

    args = argparse.Namespace(
        config='configs/inference/default.yaml',
        source_image=source_image,
        driving_audio=driving_audio,
        output=output_path,
        pose_weight=1.0,
        face_weight=1.0,
        lip_weight=1.0,
        face_expand_ratio=1.2,
        checkpoint=None,
    )
    inference_process(args)
    return output_path
# Custom stylesheet handed to gr.Blocks(css=css). It styles two notice boxes
# selected by element id: "warning-ready" (green palette) and
# "warning-duplicate" (blue palette, with a dark-mode override). Only the
# "warning-duplicate" id is assigned to a component in this file.
css = '''
div#warning-ready {
background-color: #ecfdf5;
padding: 0 16px 16px;
margin: 20px 0;
color: #030303!important;
}
div#warning-ready > .gr-prose > h2, div#warning-ready > .gr-prose > p {
color: #057857!important;
}
div#warning-duplicate {
background-color: #ebf5ff;
padding: 0 16px 16px;
margin: 20px 0;
color: #030303!important;
}
div#warning-duplicate > .gr-prose > h2, div#warning-duplicate > .gr-prose > p {
color: #0f4592!important;
}
div#warning-duplicate strong {
color: #0f4592;
}
p.actions {
display: flex;
align-items: center;
margin: 20px 0;
}
div#warning-duplicate .actions a {
display: inline-block;
margin-right: 10px;
}
.dark #warning-duplicate {
background-color: #0c0c0c !important;
border: 1px solid white !important;
}
'''
# Build the Gradio interface: an optional "duplicate this Space" banner, the
# usage notes, the two inputs with a Generate button, and the output video.
with gr.Blocks(css=css) as demo:
    # Shown only on the shared Space, where run_inference refuses to run:
    # a notice plus a link that opens the "duplicate" dialog for this Space.
    # SPACE_ID is read directly here; this branch is only taken when
    # is_shared_ui is true.
    if is_shared_ui:
        top_description = gr.HTML(f'''
<div class="gr-prose">
<h2 class="custom-color"><svg xmlns="http://www.w3.org/2000/svg" width="18px" height="18px" style="margin-right: 0px;display: inline-block;"fill="none"><path fill="#fff" d="M7 13.2a6.3 6.3 0 0 0 4.4-10.7A6.3 6.3 0 0 0 .6 6.9 6.3 6.3 0 0 0 7 13.2Z"/><path fill="#fff" fill-rule="evenodd" d="M7 0a6.9 6.9 0 0 1 4.8 11.8A6.9 6.9 0 0 1 0 7 6.9 6.9 0 0 1 7 0Zm0 0v.7V0ZM0 7h.6H0Zm7 6.8v-.6.6ZM13.7 7h-.6.6ZM9.1 1.7c-.7-.3-1.4-.4-2.2-.4a5.6 5.6 0 0 0-4 1.6 5.6 5.6 0 0 0-1.6 4 5.6 5.6 0 0 0 1.6 4 5.6 5.6 0 0 0 4 1.7 5.6 5.6 0 0 0 4-1.7 5.6 5.6 0 0 0 1.7-4 5.6 5.6 0 0 0-1.7-4c-.5-.5-1.1-.9-1.8-1.2Z" clip-rule="evenodd"/><path fill="#000" fill-rule="evenodd" d="M7 2.9a.8.8 0 1 1 0 1.5A.8.8 0 0 1 7 3ZM5.8 5.7c0-.4.3-.6.6-.6h.7c.3 0 .6.2.6.6v3.7h.5a.6.6 0 0 1 0 1.3H6a.6.6 0 0 1 0-1.3h.4v-3a.6.6 0 0 1-.6-.7Z" clip-rule="evenodd"/></svg>
Attention: this Space need to be duplicated to work</h2>
<p class="main-message custom-color">
To make it work, <strong>duplicate the Space</strong> and run it on your own profile using a <strong>private</strong> GPU.<br />
An L4 costs <strong>US$0.80/h</strong>
</p>
<p class="actions custom-color">
<a href="https://huggingface.co/spaces/{os.environ['SPACE_ID']}?duplicate=true">
<img src="https://huggingface.co/datasets/huggingface/badges/resolve/main/duplicate-this-space-lg-dark.svg" alt="Duplicate this Space" />
</a>
to start generate your talking head
</p>
</div>
''', elem_id="warning-duplicate")
    # Title, a note on expected runtime, and the input requirements.
    gr.Markdown("# Demo for Hallo: Hierarchical Audio-Driven Visual Synthesis for Portrait Image Animation")
    gr.Markdown("Generate talking head avatars driven from audio. **5 seconds of audio takes >10 minutes to generate on an L4** - duplicate the space for private use or try for free on Google Colab")
    gr.Markdown("""
Hallo has a few simple requirements for input data:
For the source image:
1. It should be cropped into squares.
2. The face should be the main focus, making up 50%-70% of the image.
3. The face should be facing forward, with a rotation angle of less than 30° (no side profiles).
For the driving audio:
1. It must be in WAV format.
2. It must be in English since our training datasets are only in this language.
3. Ensure the vocals are clear; background music is acceptable.
We have provided some [samples](https://huggingface.co/datasets/fudan-generative-ai/hallo_inference_samples) for your reference.
""")
    # Left column: inputs and the trigger button; right column: the result.
    with gr.Row():
        with gr.Column():
            # Both inputs use type="filepath", so run_inference receives
            # paths rather than decoded image/audio data.
            avatar_face = gr.Image(type="filepath", label="Face")
            driving_audio = gr.Audio(type="filepath", label="Driving audio")
            generate = gr.Button("Generate")
        with gr.Column():
            output_video = gr.Video(label="Your talking head")
    # Clicking "Generate" calls run_inference with the image and audio paths;
    # the path it returns is displayed in the video component.
    generate.click(
        fn=run_inference,
        inputs=[avatar_face, driving_audio],
        outputs=output_video
    )
# Start the Gradio app.
demo.launch()