# DiffRhythm2 / app.py
# ASLP-lab's picture
# Update app.py
# 4540ea0 verified
import gradio as gr
import torch
import json
import random
import numpy as np
import base64
import spaces
from diffrhythm2.utils import (
prepare_model,
parse_lyrics,
get_audio_prompt,
get_text_prompt,
inference,
inference_stream
)
# Upper bound used for the seed slider and for random seed draws
# (largest value representable as a signed 32-bit integer).
MAX_SEED = np.iinfo(np.int32).max

# Models are loaded onto the GPU in half precision.
device = "cuda"
dtype = torch.float16

# Bind the name first; prepare_model() rebinds it together with the other
# model handles on the next statement.
lrc_tokenizer = None
diffrhythm2, mulan, lrc_tokenizer, decoder = prepare_model(
    "ASLP-Lab/DiffRhythm2",
    device,
    dtype,
)
@spaces.GPU
def infer_music(
    lrc,
    current_prompt_type,
    audio_prompt=None,
    text_prompt=None,
    seed=42,
    randomize_seed=False,
    steps=16,
    cfg_strength=1.0,
    file_type='wav',
    odeint_method='euler',
    device='cuda'
):
    """Generate a song from lyrics plus an audio or text style prompt.

    Args:
        lrc: Raw lyrics text; tokenized via ``parse_lyrics``.
        current_prompt_type: ``"audio"`` selects ``audio_prompt`` as the style
            source; any other value selects ``text_prompt``.
        audio_prompt: Reference audio handed to ``get_audio_prompt``.
        text_prompt: Style description handed to ``get_text_prompt``.
        seed: Value passed to ``torch.manual_seed``; replaced by a random
            draw when ``randomize_seed`` is true.
        randomize_seed: When true, draw the seed uniformly from
            ``[0, MAX_SEED]``.
        steps: Forwarded to ``inference`` as ``sample_steps``.
        cfg_strength: Forwarded to ``inference``.
        file_type: Forwarded to ``inference``.
        odeint_method: Forwarded to ``inference``.
        device: Device for the lyric token tensor and the style prompt.

    Returns:
        Whatever ``inference`` returns for the generated song.

    Raises:
        gr.Error: If the audio prompt is selected but missing, or if lyric
            parsing / style-prompt extraction raises.
    """
    if randomize_seed:
        seed = random.randint(0, MAX_SEED)
    torch.manual_seed(seed)
    print(seed, current_prompt_type)

    # Fail early with an actionable message instead of passing an empty
    # audio prompt into the style encoder.
    if current_prompt_type == "audio" and not audio_prompt:
        raise gr.Error(
            "Error: no audio prompt provided. Upload a reference audio or "
            "switch to the Text Prompt tab."
        )

    try:
        lrc_prompt = parse_lyrics(lrc_tokenizer, lrc)
        # parse_lyrics yields a list of token lists; flatten into one sequence.
        lrc_prompt = torch.tensor(sum(lrc_prompt, []), dtype=torch.long, device=device)
        if current_prompt_type == "audio":
            style_prompt = get_audio_prompt(mulan, audio_prompt, device, dtype)
        else:
            style_prompt = get_text_prompt(mulan, text_prompt, device, dtype)
    except Exception as e:
        # Surface the failure in the UI while keeping the original traceback
        # attached as the cause for server-side logs.
        raise gr.Error(f"Error: {str(e)}") from e

    style_prompt = style_prompt.to(dtype)
    generate_song = inference(
        model=diffrhythm2,
        decoder=decoder,
        text=lrc_prompt,
        style_prompt=style_prompt,
        sample_steps=steps,
        cfg_strength=cfg_strength,
        odeint_method=odeint_method,
        duration=240,
        file_type=file_type
    )
    return generate_song
# for block in inference_stream(
# model=diffrhythm2,
# decoder=decoder,
# text=lrc_prompt,
# style_prompt=style_prompt,
# sample_steps=steps,
# cfg_strength=cfg_strength,
# odeint_method=odeint_method,
# duration=240,
# file_type=file_type
# ):
# yield block
# Custom stylesheet handed to gr.Blocks(css=...).
# The inline CSS comments are in Chinese; in order they read:
#   "fix the textarea height and force a scrollbar", "fixed height",
#   "max height", "vertical scroll", "preserve line breaks",
#   "line-height tweak".
# The `.lyrics-scroll-box` class is attached to the lyrics textbox via
# elem_classes; `.gr-examples` styles a transparent, bordered, rounded box.
css = """
/* 固定文本域高度并强制滚动条 */
.lyrics-scroll-box textarea {
height: 405px !important; /* 固定高度 */
max-height: 500px !important; /* 最大高度 */
overflow-y: auto !important; /* 垂直滚动 */
white-space: pre-wrap; /* 保留换行 */
line-height: 1.5; /* 行高优化 */
}
.gr-examples {
background: transparent !important;
border: 1px solid #e0e0e0 !important;
border-radius: 8px;
margin: 1rem 0 !important;
padding: 1rem !important;
}
"""
import base64
def image_to_base64(path):
    """Read the file at ``path`` as bytes and return it base64-encoded as text."""
    with open(path, "rb") as handle:
        raw_bytes = handle.read()
    encoded = base64.b64encode(raw_bytes)
    return encoded.decode('utf-8')
# Gradio UI definition.
# NOTE(review): the nesting below is reconstructed from the widget order in a
# copy of this file whose indentation was lost -- confirm the layout against
# the deployed Space.
with gr.Blocks(css=css) as demo:
    # Page header: title plus badge links to the paper, the repository and
    # the project page. (Plain string: the markup has no placeholders.)
    gr.HTML("""
<div style="flex: 1; text-align: center;">
<div style="font-size: 2em; font-weight: bold; text-align: center; margin-bottom: 5px">
Di♪♪Rhythm 2 (谛韵)
</div>
<div style="display:flex; justify-content: center; column-gap:4px;">
<a href="https://arxiv.org/pdf/2510.22950">
<img src='https://img.shields.io/badge/Arxiv-Paper-blue'>
</a>
<a href="https://github.com/ASLP-lab/DiffRhythm2">
<img src='https://img.shields.io/badge/GitHub-Repo-green'>
</a>
<a href="https://aslp-lab.github.io/DiffRhythm2.github.io/">
<img src='https://img.shields.io/badge/Project-Page-brown'>
</a>
</div>
</div>
""")
    with gr.Tabs() as tabs:
        # page 1
        with gr.Tab("Music Generate", id=0):
            with gr.Row():
                with gr.Column():
                    lrc = gr.Textbox(
                        label="Lyrics",
                        placeholder="Input the full lyrics",
                        lines=12,
                        max_lines=50,
                        elem_classes="lyrics-scroll-box",
                        value="""[start]
[intro]
[verse]
Thought I heard your voice yesterday
When I turned around to say
That I loved you baby
I realize it was juss my mind
Played tricks on me
And it seems colder lately at night
And I try to sleep with the lights on
Every time the phone rings
I pray to God it's you
And I just can't believe
That we're through
[chorus]
I miss you
There's no other way to say it
And I can't deny it
I miss you
It's so easy to see
I miss you and me
[verse]
Is it turning over this time
Have we really changed our minds about each other's love
All the feelings that we used to share
I refuse to believe
That you don't care
[chorus]
I miss you
There's no other way to say it
And I and I can't deny it
I miss you
[verse]
It's so easy to see
I've got to gather myself as together
I've been through worst kinds of weather
If it's over now
[outro]"""
                    )
                    # Tracks which style-prompt tab is active; starts on the
                    # text tab, which is listed first.
                    current_prompt_type = gr.State(value="text")
                    with gr.Tabs() as inside_tabs:
                        with gr.Tab("Text Prompt"):
                            text_prompt = gr.Textbox(
                                label="Text Prompt",
                                value="Pop, Piano, Bass, Drums, Happy",
                                placeholder="Enter the Text Prompt, eg: emotional piano pop",
                            )
                        with gr.Tab("Audio Prompt"):
                            audio_prompt = gr.Audio(label="Audio Prompt", type="filepath")

                    def update_prompt_type(evt: gr.SelectData):
                        """Map the selected inner tab index to a prompt type (0 -> text, else audio)."""
                        return "text" if evt.index == 0 else "audio"

                    inside_tabs.select(
                        fn=update_prompt_type,
                        outputs=current_prompt_type
                    )
                with gr.Column():
                    with gr.Accordion("Best Practices Guide", open=True):
                        gr.Markdown("""
1. **Lyrics Format Requirements**
- Each line must follow: `Lyric content`
- Example of valid format:
```
[intro]
[verse]
Thought I heard your voice yesterday
When I turned around to say
```
2. **Audio Prompt Requirements**
- Reference audio should be ≥ 1 second, Audio >10 seconds will be randomly clipped into 10 seconds
- For optimal results, the 10-second clips should be carefully selected
- Shorter clips may lead to incoherent generation
3. **Supported Languages**
- Chinese and English
**Due to issues with Gradio's streaming audio output, we will update the streaming feature in the future. Please stay tuned!**
""")
                    lyrics_btn = gr.Button("Generate", variant="primary")
                    # audio_output = gr.Gallery(label="Audio Results")
                    audio_output = gr.Audio(label="Audio Result", elem_id="audio_output")
                    with gr.Accordion("Advanced Settings", open=False):
                        seed = gr.Slider(
                            label="Seed",
                            minimum=0,
                            maximum=MAX_SEED,
                            step=1,
                            value=0,
                        )
                        randomize_seed = gr.Checkbox(label="Randomize seed", value=True)
                        steps = gr.Slider(
                            minimum=10,
                            maximum=100,
                            value=16,
                            step=1,
                            label="Diffusion Steps",
                            interactive=True,
                            elem_id="step_slider"
                        )
                        # NOTE(review): the default 1.3 is not on the 0.5 step
                        # grid that starts at 1 -- confirm whether step or
                        # default should change.
                        cfg_strength = gr.Slider(
                            minimum=1,
                            maximum=10,
                            value=1.3,
                            step=0.5,
                            label="CFG Strength",
                            interactive=True,
                            # Own id: this slider previously reused
                            # "step_slider", duplicating the id of the
                            # Diffusion Steps slider.
                            elem_id="cfg_slider"
                        )
                        odeint_method = gr.Radio(["euler", "midpoint", "rk4","implicit_adams"], label="ODE Solver", value="euler")
                        file_type = gr.Dropdown(["wav", "mp3", "ogg"], label="Output Format", value="mp3")
            # gr.Examples(
            # examples=[
            # ["src/prompt/classic_cn.wav"],
            # ["src/prompt/classic_en.wav"],
            # ["src/prompt/country_cn.wav"],
            # ["src/prompt/country_en.wav"],
            # ["src/prompt/jazz_cn.wav"],
            # ["src/prompt/jazz_en.wav"],
            # ["src/prompt/pop_cn.wav"],
            # ["src/prompt/pop_en.wav"],
            # ["src/prompt/rap_cn.wav"],
            # ["src/prompt/rap_en.wav"],
            # ["src/prompt/rock_cn.wav"],
            # ["src/prompt/rock_en.wav"]
            # ],
            # inputs=[audio_prompt],
            # label="Audio Examples",
            # examples_per_page=12,
            # elem_id="audio-examples-container"
            # )
            # gr.Examples(
            # examples=[
            # ["Pop Emotional Piano"],
            # ["流行 情感 钢琴"],
            # ["Indie folk ballad, coming-of-age themes, acoustic guitar picking with harmonica interludes"],
            # ["独立民谣, 成长主题, 原声吉他弹奏与口琴间奏"]
            # ],
            # inputs=[text_prompt],
            # label="Text Examples",
            # examples_per_page=4,
            # elem_id="text-examples-container"
            # )
            # gr.Examples(
            # examples=[
            # ["""[00:10.00]Moonlight spills through broken blinds\n[00:13.20]Your shadow dances on the dashboard shrine\n[00:16.85]Neon ghosts in gasoline rain\n[00:20.40]I hear your laughter down the midnight train\n[00:24.15]Static whispers through frayed wires\n[00:27.65]Guitar strings hum our cathedral choirs\n[00:31.30]Flicker screens show reruns of June\n[00:34.90]I'm drowning in this mercury lagoon\n[00:38.55]Electric veins pulse through concrete skies\n[00:42.10]Your name echoes in the hollow where my heartbeat lies\n[00:45.75]We're satellites trapped in parallel light\n[00:49.25]Burning through the atmosphere of endless night\n[01:00.00]Dusty vinyl spins reverse\n[01:03.45]Our polaroid timeline bleeds through the verse\n[01:07.10]Telescope aimed at dead stars\n[01:10.65]Still tracing constellations through prison bars\n[01:14.30]Electric veins pulse through concrete skies\n[01:17.85]Your name echoes in the hollow where my heartbeat lies\n[01:21.50]We're satellites trapped in parallel light\n[01:25.05]Burning through the atmosphere of endless night\n[02:10.00]Clockwork gears grind moonbeams to rust\n[02:13.50]Our fingerprint smudged by interstellar dust\n[02:17.15]Velvet thunder rolls through my veins\n[02:20.70]Chasing phantom trains through solar plane\n[02:24.35]Electric veins pulse through concrete skies\n[02:27.90]Your name echoes in the hollow where my heartbeat lies"""],
            # ["""[00:05.00]Stardust whispers in your eyes\n[00:09.30]Moonlight paints our silhouettes\n[00:13.75]Tides bring secrets from the deep\n[00:18.20]Where forever's breath is kept\n[00:22.90]We dance through constellations' maze\n[00:27.15]Footprints melt in cosmic waves\n[00:31.65]Horizons hum our silent vow\n[00:36.10]Time unravels here and now\n[00:40.85]Eternal embers in the night oh oh oh\n[00:45.25]Healing scars with liquid light\n[00:49.70]Galaxies write our refrain\n[00:54.15]Love reborn in endless rain\n[01:15.30]Paper boats of memories\n[01:19.75]Float through veins of ancient trees\n[01:24.20]Your laughter spins aurora threads\n[01:28.65]Weaving dawn through featherbed"""],
            # ["""[00:04.27]只因你太美 baby\n[00:08.95]只因你实在是太美 baby\n[00:13.99]只因你太美 baby\n[00:18.89]迎面走来的你让我如此蠢蠢欲动\n[00:20.88]这种感觉我从未有\n[00:21.79]Cause I got a crush on you who you\n[00:25.74]你是我的我是你的谁\n[00:28.09]再多一眼看一眼就会爆炸\n[00:30.31]再近一点靠近点快被融化\n[00:32.49]想要把你占为己有 baby\n[00:34.60]不管走到哪里\n[00:35.44]都会想起的人是你 you you\n[00:38.12]我应该拿你怎样\n[00:39.61]Uh 所有人都在看着你\n[00:42.36]我的心总是不安\n[00:44.18]Oh 我现在已病入膏肓\n[00:46.63]Eh oh\n[00:47.84]难道真的因你而疯狂吗\n[00:51.57]我本来不是这种人\n[00:53.59]因你变成奇怪的人\n[00:55.77]第一次呀变成这样的我\n[01:01.23]不管我怎么去否认\n[01:03.21]只因你太美 baby\n[01:11.46]只因你实在是太美 baby\n[01:16.75]只因你太美 baby\n[01:21.09]Oh eh oh\n[01:22.82]现在确认地告诉我\n[01:25.26]Oh eh oh\n[01:27.31]你到底属于谁\n[01:29.98]Oh eh oh\n[01:31.70]现在确认地告诉我\n[01:34.45]Oh eh oh\n[01:36.35]你到底属于谁\n[01:37.65]就是现在告诉我\n[01:40.00]跟着那节奏 缓缓 make wave\n"""],
            # ["""[00:16.55]倦鸟西归 竹影余晖\n[00:23.58]禅意心扉\n[00:27.32]待清风 拂开一池春水\n[00:30.83]你的手绘 玉色难褪\n[00:37.99]我端详飘散的韵味\n[00:40.65]落款壶底的名讳\n[00:42.92]如吻西施的嘴\n[00:45.14]风雅几回 总相随\n[00:52.32]皆因你珍贵\n[00:57.85]三千弱水 煮一杯\n[01:02.21]我只饮下你的美\n[01:04.92]千年余味 紫砂壶伴我醉\n[01:09.73]酿一世无悔\n[01:12.09]沏壶春水 翠烟飞\n[01:16.62]把盏不尽你的香味\n[01:20.06]邀月相对 愿今生同宿同归\n[01:26.43]只让你陪\n[01:46.12]茗香芳菲 世俗无追\n"""]
            # ],
            # inputs=[lrc],
            # label="Lrc Examples",
            # examples_per_page=4,
            # elem_id="lrc-examples-container",
            # )
    # No-op listener on the outer tabs. The callback takes no parameters
    # because no inputs are wired to it; the previous one-argument lambda
    # would have been called without its argument.
    tabs.select(
        lambda: None,
        None,
        None
    )
    # TODO add max_frames parameter for infer_music
    # The inputs are passed positionally, so their order must follow the
    # parameter order of infer_music (file_type before odeint_method).
    lyrics_btn.click(
        fn=infer_music,
        inputs=[
            lrc,
            current_prompt_type,
            audio_prompt,
            text_prompt,
            seed,
            randomize_seed,
            steps,
            cfg_strength,
            file_type,
            odeint_method,
        ],
        outputs=audio_output,
    )
# demo.queue().launch(show_api=False, show_error=True)
# Launch the Gradio app only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    demo.launch()