midi-composer

Sleeping

App Files Files Community

skytnt committed on Oct 1, 2024

Commit

fd012a7

1 Parent(s): 81592e1

torch

Browse files

Files changed (3) hide show

app.py +58 -92
midi_model.py +56 -16
requirements.txt +3 -1

app.py CHANGED Viewed

@@ -1,79 +1,53 @@
 import argparse
 import glob
 import json
-import os.path
 import time
 import gradio as gr
 import numpy as np
-import onnxruntime as rt
 import tqdm
 from huggingface_hub import hf_hub_download
 import MIDI
 from midi_synthesizer import MidiSynthesizer
-from midi_tokenizer import MIDITokenizer
 MAX_SEED = np.iinfo(np.int32).max
 in_space = os.getenv("SYSTEM") == "spaces"
-def softmax(x, axis):
-    x_max = np.amax(x, axis=axis, keepdims=True)
-    exp_x_shifted = np.exp(x - x_max)
-    return exp_x_shifted / np.sum(exp_x_shifted, axis=axis, keepdims=True)
-def sample_top_p_k(probs, p, k, generator=None):
-    if generator is None:
-        generator = np.random
-    probs_idx = np.argsort(-probs, axis=-1)
-    probs_sort = np.take_along_axis(probs, probs_idx, -1)
-    probs_sum = np.cumsum(probs_sort, axis=-1)
-    mask = probs_sum - probs_sort > p
-    probs_sort[mask] = 0.0
-    mask = np.zeros(probs_sort.shape[-1])
-    mask[:k] = 1
-    probs_sort = probs_sort * mask
-    probs_sort /= np.sum(probs_sort, axis=-1, keepdims=True)
-    shape = probs_sort.shape
-    probs_sort_flat = probs_sort.reshape(-1, shape[-1])
-    probs_idx_flat = probs_idx.reshape(-1, shape[-1])
-    next_token = np.stack([generator.choice(idxs, p=pvals) for pvals, idxs in zip(probs_sort_flat, probs_idx_flat)])
-    next_token = next_token.reshape(*shape[:-1])
-    return next_token
-def generate(model, prompt=None, max_len=512, temp=1.0, top_p=0.98, top_k=20,
              disable_patch_change=False, disable_control_change=False, disable_channels=None, generator=None):
-    tokenizer = model[2]
     if disable_channels is not None:
         disable_channels = [tokenizer.parameter_ids["channel"][c] for c in disable_channels]
     else:
         disable_channels = []
-    if generator is None:
-        generator = np.random
     max_token_seq = tokenizer.max_token_seq
     if prompt is None:
-        input_tensor = np.full((1, max_token_seq), tokenizer.pad_id, dtype=np.int64)
         input_tensor[0, 0] = tokenizer.bos_id  # bos
     else:
         prompt = prompt[:, :max_token_seq]
         if prompt.shape[-1] < max_token_seq:
             prompt = np.pad(prompt, ((0, 0), (0, max_token_seq - prompt.shape[-1])),
                             mode="constant", constant_values=tokenizer.pad_id)
-        input_tensor = prompt
-    input_tensor = input_tensor[None, :, :]
     cur_len = input_tensor.shape[1]
-    bar = tqdm.tqdm(desc="generating", total=max_len - cur_len, disable=in_space)
     with bar:
         while cur_len < max_len:
             end = False
-            hidden = model[0].run(None, {'x': input_tensor})[0][:, -1]
-            next_token_seq = np.empty((1, 0), dtype=np.int64)
             event_name = ""
             for i in range(max_token_seq):
-                mask = np.zeros(tokenizer.vocab_size, dtype=np.int64)
                 if i == 0:
                     mask_ids = list(tokenizer.event_ids.values()) + [tokenizer.eos_id]
                     if disable_patch_change:
@@ -87,9 +61,9 @@ def generate(model, prompt=None, max_len=512, temp=1.0, top_p=0.98, top_k=20,
                     if param_name == "channel":
                         mask_ids = [i for i in mask_ids if i not in disable_channels]
                     mask[mask_ids] = 1
-                logits = model[1].run(None, {'x': next_token_seq, "hidden": hidden})[0][:, -1:]
-                scores = softmax(logits / temp, -1) * mask
-                sample = sample_top_p_k(scores, top_p, top_k, generator)
                 if i == 0:
                     next_token_seq = sample
                     eid = sample.item()
@@ -98,17 +72,17 @@ def generate(model, prompt=None, max_len=512, temp=1.0, top_p=0.98, top_k=20,
                         break
                     event_name = tokenizer.id_events[eid]
                 else:
-                    next_token_seq = np.concatenate([next_token_seq, sample], axis=1)
                     if len(tokenizer.events[event_name]) == i:
                         break
             if next_token_seq.shape[1] < max_token_seq:
-                next_token_seq = np.pad(next_token_seq, ((0, 0), (0, max_token_seq - next_token_seq.shape[-1])),
-                                        mode="constant", constant_values=tokenizer.pad_id)
-            next_token_seq = next_token_seq[None, :, :]
-            input_tensor = np.concatenate([input_tensor, next_token_seq], axis=1)
             cur_len += 1
             bar.update(1)
-            yield next_token_seq.reshape(-1)
             if end:
                 break
@@ -125,7 +99,7 @@ def run(model_name, tab, mid_seq, continuation_state, instruments, drum_kit, bpm
         reduce_cc_st, remap_track_channel, add_default_instr, remove_empty_channels, seed, seed_rand,
         gen_events, temp, top_p, top_k, allow_cc):
     model = models[model_name]
-    tokenizer = model[2]
     bpm = int(bpm)
     if time_sig == "auto":
         time_sig = None
@@ -147,7 +121,7 @@ def run(model_name, tab, mid_seq, continuation_state, instruments, drum_kit, bpm
     max_len = gen_events
     if seed_rand:
         seed = np.random.randint(0, MAX_SEED)
-    generator = np.random.RandomState(seed)
     disable_patch_change = False
     disable_channels = None
     if tab == 0:
@@ -203,22 +177,24 @@ def run(model_name, tab, mid_seq, continuation_state, instruments, drum_kit, bpm
         init_msgs += [create_msg("visualizer_clear", tokenizer.version),
                       create_msg("visualizer_append", events)]
     yield mid_seq, continuation_state, None, None, seed, send_msgs(init_msgs)
-    midi_generator = generate(model, mid, max_len=max_len, temp=temp, top_p=top_p, top_k=top_k,
-                              disable_patch_change=disable_patch_change, disable_control_change=not allow_cc,
-                              disable_channels=disable_channels, generator=generator)
-    events = []
-    t = time.time() + 1
-    for i, token_seq in enumerate(midi_generator):
-        token_seq = token_seq.tolist()
-        mid_seq.append(token_seq)
-        events.append(tokenizer.tokens2event(token_seq))
-        ct = time.time()
-        if ct - t > 0.5:
-            yield (mid_seq, continuation_state, None, None, seed,
-                   send_msgs([create_msg("visualizer_append", events),
-                              create_msg("progress", [i + 1, gen_events])]))
-            t = ct
-            events = []
     events = [tokenizer.tokens2event(tokens) for tokens in mid_seq]
     mid = tokenizer.detokenize(mid_seq)
@@ -235,7 +211,7 @@ def run(model_name, tab, mid_seq, continuation_state, instruments, drum_kit, bpm
 def cancel_run(model_name, mid_seq):
     if mid_seq is None:
         return None, None, []
-    tokenizer = models[model_name][2]
     events = [tokenizer.tokens2event(tokens) for tokens in mid_seq]
     mid = tokenizer.detokenize(mid_seq)
     audio = synthesizer.synthesis(MIDI.score2opus(mid))
@@ -248,11 +224,12 @@ def cancel_run(model_name, mid_seq):
     return "output.mid", (44100, audio), send_msgs(end_msgs)
-def undo_continuation(mid_seq, continuation_state):
     if mid_seq is None or len(continuation_state) < 2:
         return mid_seq, continuation_state, send_msgs([])
     mid_seq = mid_seq[:continuation_state[-1]]
     continuation_state = continuation_state[:-1]
     events = [tokenizer.tokens2event(tokens) for tokens in mid_seq]
     end_msgs = [create_msg("visualizer_clear", tokenizer.version),
                 create_msg("visualizer_append", events),
@@ -293,21 +270,6 @@ def hf_hub_download_retry(repo_id, filename):
         raise err
-def get_tokenizer(config_name):
-    tv, size = config_name.split("-")
-    tv = tv[1:]
-    if tv[-1] == "o":
-        o = True
-        tv = tv[:-1]
-    else:
-        o = False
-    if tv not in ["v1", "v2"]:
-        raise ValueError(f"Unknown tokenizer version {tv}")
-    tokenizer = MIDITokenizer(tv)
-    tokenizer.set_optimise_midi(o)
-    return tokenizer
 number2drum_kits = {-1: "None", 0: "Standard", 8: "Room", 16: "Power", 24: "Electric", 25: "TR-808", 32: "Jazz",
                     40: "Blush", 48: "Orchestra"}
 patch2number = {v: k for k, v in MIDI.Number2patch.items()}
@@ -319,6 +281,7 @@ if __name__ == "__main__":
     parser = argparse.ArgumentParser()
     parser.add_argument("--share", action="store_true", default=False, help="share gradio app")
     parser.add_argument("--port", type=int, default=7860, help="gradio server port")
     parser.add_argument("--max-gen", type=int, default=1024, help="max")
     opt = parser.parse_args()
     soundfont_path = hf_hub_download_retry(repo_id="skytnt/midi-model", filename="soundfont.sf2")
@@ -331,14 +294,17 @@ if __name__ == "__main__":
                    "touhou finetune model (tv1-medium) by skytnt": ["skytnt/midi-model-ft", "touhou/", "tv1-medium"],
                    }
     models = {}
-    providers = ['CUDAExecutionProvider', 'CPUExecutionProvider']
     for name, (repo_id, path, config) in models_info.items():
-        model_base_path = hf_hub_download_retry(repo_id=repo_id, filename=f"{path}onnx/model_base.onnx")
-        model_token_path = hf_hub_download_retry(repo_id=repo_id, filename=f"{path}onnx/model_token.onnx")
-        model_base = rt.InferenceSession(model_base_path, providers=providers)
-        model_token = rt.InferenceSession(model_token_path, providers=providers)
-        tokenizer = get_tokenizer(config)
-        models[name] = [model_base, model_token, tokenizer]
     load_javascript()
     app = gr.Blocks()
@@ -447,6 +413,6 @@ if __name__ == "__main__":
         stop_btn.click(cancel_run, [input_model, output_midi_seq],
                        [output_midi, output_audio, js_msg],
                        cancels=run_event, queue=False)
-        undo_btn.click(undo_continuation, [output_midi_seq, output_continuation_state],
                        [output_midi_seq, output_continuation_state, js_msg], queue=False)
     app.launch(server_port=opt.port, share=opt.share, inbrowser=True)

 import argparse
 import glob
 import json
+import os
 import time
 import gradio as gr
 import numpy as np
+import torch
+import torch.nn.functional as F
 import tqdm
 from huggingface_hub import hf_hub_download
 import MIDI
+from midi_model import MIDIModel, MIDIModelConfig
 from midi_synthesizer import MidiSynthesizer
 MAX_SEED = np.iinfo(np.int32).max
 in_space = os.getenv("SYSTEM") == "spaces"
+@torch.inference_mode()
+def generate(model: MIDIModel, prompt=None, max_len=512, temp=1.0, top_p=0.98, top_k=20,
              disable_patch_change=False, disable_control_change=False, disable_channels=None, generator=None):
+    tokenizer = model.tokenizer
     if disable_channels is not None:
         disable_channels = [tokenizer.parameter_ids["channel"][c] for c in disable_channels]
     else:
         disable_channels = []
     max_token_seq = tokenizer.max_token_seq
     if prompt is None:
+        input_tensor = torch.full((1, max_token_seq), tokenizer.pad_id, dtype=torch.long, device=model.device)
         input_tensor[0, 0] = tokenizer.bos_id  # bos
     else:
         prompt = prompt[:, :max_token_seq]
         if prompt.shape[-1] < max_token_seq:
             prompt = np.pad(prompt, ((0, 0), (0, max_token_seq - prompt.shape[-1])),
                             mode="constant", constant_values=tokenizer.pad_id)
+        input_tensor = torch.from_numpy(prompt).to(dtype=torch.long, device=model.device)
+    input_tensor = input_tensor.unsqueeze(0)
     cur_len = input_tensor.shape[1]
+    bar = tqdm.tqdm(desc="generating", total=max_len - cur_len)
     with bar:
         while cur_len < max_len:
             end = False
+            hidden = model.forward(input_tensor)[0, -1].unsqueeze(0)
+            next_token_seq = None
             event_name = ""
             for i in range(max_token_seq):
+                mask = torch.zeros(tokenizer.vocab_size, dtype=torch.int64, device=model.device)
                 if i == 0:
                     mask_ids = list(tokenizer.event_ids.values()) + [tokenizer.eos_id]
                     if disable_patch_change:
                     if param_name == "channel":
                         mask_ids = [i for i in mask_ids if i not in disable_channels]
                     mask[mask_ids] = 1
+                logits = model.forward_token(hidden, next_token_seq)[:, -1:]
+                scores = torch.softmax(logits / temp, dim=-1) * mask
+                sample = model.sample_top_p_k(scores, top_p, top_k, generator=generator)
                 if i == 0:
                     next_token_seq = sample
                     eid = sample.item()
                         break
                     event_name = tokenizer.id_events[eid]
                 else:
+                    next_token_seq = torch.cat([next_token_seq, sample], dim=1)
                     if len(tokenizer.events[event_name]) == i:
                         break
             if next_token_seq.shape[1] < max_token_seq:
+                next_token_seq = F.pad(next_token_seq, (0, max_token_seq - next_token_seq.shape[1]),
+                                       "constant", value=tokenizer.pad_id)
+            next_token_seq = next_token_seq.unsqueeze(1)
+            input_tensor = torch.cat([input_tensor, next_token_seq], dim=1)
             cur_len += 1
             bar.update(1)
+            yield next_token_seq.reshape(-1).cpu().numpy()
             if end:
                 break
         reduce_cc_st, remap_track_channel, add_default_instr, remove_empty_channels, seed, seed_rand,
         gen_events, temp, top_p, top_k, allow_cc):
     model = models[model_name]
+    tokenizer = model.tokenizer
     bpm = int(bpm)
     if time_sig == "auto":
         time_sig = None
     max_len = gen_events
     if seed_rand:
         seed = np.random.randint(0, MAX_SEED)
+    generator = torch.Generator(opt.device).manual_seed(seed)
     disable_patch_change = False
     disable_channels = None
     if tab == 0:
         init_msgs += [create_msg("visualizer_clear", tokenizer.version),
                       create_msg("visualizer_append", events)]
     yield mid_seq, continuation_state, None, None, seed, send_msgs(init_msgs)
+    ctx = torch.amp.autocast(device_type=opt.device, dtype=torch.bfloat16, enabled=opt.device != "cpu")
+    with ctx:
+        midi_generator = generate(model, mid, max_len=max_len, temp=temp, top_p=top_p, top_k=top_k,
+                                  disable_patch_change=disable_patch_change, disable_control_change=not allow_cc,
+                                  disable_channels=disable_channels, generator=generator)
+        events = []
+        t = time.time() + 1
+        for i, token_seq in enumerate(midi_generator):
+            token_seq = token_seq.tolist()
+            mid_seq.append(token_seq)
+            events.append(tokenizer.tokens2event(token_seq))
+            ct = time.time()
+            if ct - t > 0.5:
+                yield (mid_seq, continuation_state, None, None, seed,
+                       send_msgs([create_msg("visualizer_append", events),
+                                  create_msg("progress", [i + 1, gen_events])]))
+                t = ct
+                events = []
     events = [tokenizer.tokens2event(tokens) for tokens in mid_seq]
     mid = tokenizer.detokenize(mid_seq)
 def cancel_run(model_name, mid_seq):
     if mid_seq is None:
         return None, None, []
+    tokenizer = models[model_name].tokenizer
     events = [tokenizer.tokens2event(tokens) for tokens in mid_seq]
     mid = tokenizer.detokenize(mid_seq)
     audio = synthesizer.synthesis(MIDI.score2opus(mid))
     return "output.mid", (44100, audio), send_msgs(end_msgs)
+def undo_continuation(model_name, mid_seq, continuation_state):
     if mid_seq is None or len(continuation_state) < 2:
         return mid_seq, continuation_state, send_msgs([])
     mid_seq = mid_seq[:continuation_state[-1]]
     continuation_state = continuation_state[:-1]
+    tokenizer = models[model_name].tokenizer
     events = [tokenizer.tokens2event(tokens) for tokens in mid_seq]
     end_msgs = [create_msg("visualizer_clear", tokenizer.version),
                 create_msg("visualizer_append", events),
         raise err
 number2drum_kits = {-1: "None", 0: "Standard", 8: "Room", 16: "Power", 24: "Electric", 25: "TR-808", 32: "Jazz",
                     40: "Blush", 48: "Orchestra"}
 patch2number = {v: k for k, v in MIDI.Number2patch.items()}
     parser = argparse.ArgumentParser()
     parser.add_argument("--share", action="store_true", default=False, help="share gradio app")
     parser.add_argument("--port", type=int, default=7860, help="gradio server port")
+    parser.add_argument("--device", type=str, default="cuda", help="device to run model")
     parser.add_argument("--max-gen", type=int, default=1024, help="max")
     opt = parser.parse_args()
     soundfont_path = hf_hub_download_retry(repo_id="skytnt/midi-model", filename="soundfont.sf2")
                    "touhou finetune model (tv1-medium) by skytnt": ["skytnt/midi-model-ft", "touhou/", "tv1-medium"],
                    }
     models = {}
+    if opt.device == "cuda":
+        torch.backends.cuda.enable_mem_efficient_sdp(True)
+        torch.backends.cuda.enable_flash_sdp(True)
     for name, (repo_id, path, config) in models_info.items():
+        model_path = hf_hub_download_retry(repo_id=repo_id, filename=f"{path}model.ckpt")
+        model = MIDIModel(config=MIDIModelConfig.from_name(config))
+        ckpt = torch.load(model_path, map_location="cpu")
+        state_dict = ckpt.get("state_dict", ckpt)
+        model.load_state_dict(state_dict, strict=False)
+        model.to(device=opt.device, dtype=torch.bfloat16 if opt.device == "cuda" else torch.float32).eval()
+        models[name] = model
     load_javascript()
     app = gr.Blocks()
         stop_btn.click(cancel_run, [input_model, output_midi_seq],
                        [output_midi, output_audio, js_msg],
                        cancels=run_event, queue=False)
+        undo_btn.click(undo_continuation, [input_model, output_midi_seq, output_continuation_state],
                        [output_midi_seq, output_continuation_state, js_msg], queue=False)
     app.launch(server_port=opt.port, share=opt.share, inbrowser=True)

midi_model.py CHANGED Viewed

@@ -1,3 +1,5 @@
 import numpy as np
 import torch
 import torch.nn as nn
@@ -5,23 +7,61 @@ import torch.nn.functional as F
 import tqdm
 from transformers import LlamaModel, LlamaConfig
-from midi_tokenizer import MIDITokenizer
 class MIDIModel(nn.Module):
-    def __init__(self, tokenizer: MIDITokenizer, n_layer=12, n_head=16, n_embd=1024, n_inner=4096,
-                 *args, **kwargs):
         super(MIDIModel, self).__init__()
-        self.tokenizer = tokenizer
-        self.net = LlamaModel(LlamaConfig(vocab_size=tokenizer.vocab_size,
-                                          hidden_size=n_embd, num_attention_heads=n_head,
-                                          num_hidden_layers=n_layer, intermediate_size=n_inner,
-                                          pad_token_id=tokenizer.pad_id, max_position_embeddings=4096))
-        self.net_token = LlamaModel(LlamaConfig(vocab_size=tokenizer.vocab_size,
-                                                hidden_size=n_embd, num_attention_heads=n_head // 4,
-                                                num_hidden_layers=n_layer // 4, intermediate_size=n_inner // 4,
-                                                pad_token_id=tokenizer.pad_id, max_position_embeddings=4096))
-        self.lm_head = nn.Linear(n_embd, tokenizer.vocab_size, bias=False)
         self.device = "cpu"
     def to(self, *args, **kwargs):
@@ -71,7 +111,7 @@ class MIDIModel(nn.Module):
         return next_token
     @torch.inference_mode()
-    def generate(self, prompt=None, max_len=512, temp=1.0, top_p=0.98, top_k=20, amp=True, generator=None):
         tokenizer = self.tokenizer
         max_token_seq = tokenizer.max_token_seq
         if prompt is None:
@@ -86,7 +126,7 @@ class MIDIModel(nn.Module):
         input_tensor = input_tensor.unsqueeze(0)
         cur_len = input_tensor.shape[1]
         bar = tqdm.tqdm(desc="generating", total=max_len - cur_len)
-        with bar, torch.cuda.amp.autocast(enabled=amp):
             while cur_len < max_len:
                 end = False
                 hidden = self.forward(input_tensor)[0, -1].unsqueeze(0)
@@ -123,4 +163,4 @@ class MIDIModel(nn.Module):
                 bar.update(1)
                 if end:
                     break
-        return input_tensor[0].cpu().numpy()

+from typing import Union
 import numpy as np
 import torch
 import torch.nn as nn
 import tqdm
 from transformers import LlamaModel, LlamaConfig
+from midi_tokenizer import MIDITokenizerV1, MIDITokenizerV2, MIDITokenizer
+config_name_list = ["tv1-medium", "tv2-medium", "tv2o-medium", "tv2-large", "tv2o-large"]
+class MIDIModelConfig:
+    """Bundle the tokenizer and the two LlamaConfig objects that MIDIModel is built from."""
+    def __init__(self, tokenizer: Union[MIDITokenizerV1, MIDITokenizerV2],
+                 net_config: LlamaConfig, net_token_config: LlamaConfig):
+        """Store the tokenizer and the configs for the `net` and `net_token` networks."""
+        self.tokenizer = tokenizer
+        self.net_config = net_config
+        self.net_token_config = net_token_config
+        # Embedding width is read from the token network's config; get_config() below
+        # gives both networks the same hidden_size, so either config would match there.
+        self.n_embd = net_token_config.hidden_size
+    @staticmethod
+    def get_config(tokenizer_ver="v2", optimise_midi=True, n_layer=12, n_head=16, n_embd=1024, n_inner=4096):
+        """Create a tokenizer and derive both LlamaConfig objects from the given sizes.
+
+        tokenizer_ver is passed to MIDITokenizer(); optimise_midi is forwarded to
+        tokenizer.set_optimise_midi(). n_layer / n_head / n_embd / n_inner size the main
+        network; the token network reuses n_embd and uses a quarter (integer division)
+        of the heads, layers and intermediate size.
+        Returns a new MIDIModelConfig.
+        """
+        tokenizer = MIDITokenizer(tokenizer_ver)
+        tokenizer.set_optimise_midi(optimise_midi)
+        net_config = LlamaConfig(vocab_size=tokenizer.vocab_size,
+                                 hidden_size=n_embd, num_attention_heads=n_head,
+                                 num_hidden_layers=n_layer, intermediate_size=n_inner,
+                                 pad_token_id=tokenizer.pad_id, max_position_embeddings=4096)
+        # Same vocabulary, pad id and hidden size as net_config, but n_head, n_layer and
+        # n_inner are each floor-divided by 4.
+        net_token_config = LlamaConfig(vocab_size=tokenizer.vocab_size,
+                                       hidden_size=n_embd, num_attention_heads=n_head // 4,
+                                       num_hidden_layers=n_layer // 4, intermediate_size=n_inner // 4,
+                                       pad_token_id=tokenizer.pad_id, max_position_embeddings=4096)
+        return MIDIModelConfig(tokenizer, net_config, net_token_config)
+    @staticmethod
+    def from_name(name="tv2o-medium"):
+        """Build a config from a name of the form "t<version>[o]-<size>".
+
+        <version> must be "v1" or "v2"; a trailing "o" turns optimise_midi on;
+        <size> must be "medium" (12 layers) or "large" (24 layers).
+        Raises ValueError for an unknown tokenizer version or model size.
+        """
+        # NOTE(review): a name without exactly one "-" fails here with the tuple-unpacking
+        # ValueError rather than one of the explicit messages below.
+        tv, size = name.split("-")
+        # Drop the first character of the tokenizer part (the "t" in the known names).
+        tv = tv[1:]
+        # NOTE(review): if nothing is left after dropping the first character (e.g. "t-medium"),
+        # tv[-1] raises IndexError instead of ValueError.
+        if tv[-1] == "o":
+            o = True
+            tv = tv[:-1]
+        else:
+            o = False
+        if tv not in ["v1", "v2"]:
+            raise ValueError(f"Unknown tokenizer version {tv}")
+        # The two sizes differ only in n_layer (12 vs 24); the other arguments are identical.
+        if size == "medium":
+            return MIDIModelConfig.get_config(tokenizer_ver=tv, optimise_midi=o,
+                                              n_layer=12, n_head=16, n_embd=1024, n_inner=4096)
+        elif size == "large":
+            return MIDIModelConfig.get_config(tokenizer_ver=tv, optimise_midi=o,
+                                              n_layer=24, n_head=16, n_embd=1024, n_inner=4096)
+        else:
+            raise ValueError(f"Unknown model size {size}")
 class MIDIModel(nn.Module):
+    def __init__(self, config: MIDIModelConfig, *args, **kwargs):
         super(MIDIModel, self).__init__()
+        self.tokenizer = config.tokenizer
+        self.net = LlamaModel(config.net_config)
+        self.net_token = LlamaModel(config.net_token_config)
+        self.lm_head = nn.Linear(config.n_embd, self.tokenizer.vocab_size, bias=False)
         self.device = "cpu"
     def to(self, *args, **kwargs):
         return next_token
     @torch.inference_mode()
+    def generate(self, prompt=None, max_len=512, temp=1.0, top_p=0.98, top_k=20, generator=None):
         tokenizer = self.tokenizer
         max_token_seq = tokenizer.max_token_seq
         if prompt is None:
         input_tensor = input_tensor.unsqueeze(0)
         cur_len = input_tensor.shape[1]
         bar = tqdm.tqdm(desc="generating", total=max_len - cur_len)
+        with bar:
             while cur_len < max_len:
                 end = False
                 hidden = self.forward(input_tensor)[0, -1].unsqueeze(0)
                 bar.update(1)
                 if end:
                     break
+        return input_tensor[0].cpu().numpy()

requirements.txt CHANGED Viewed

@@ -1,6 +1,8 @@
 Pillow
 numpy
-onnxruntime-gpu
 gradio==4.43.0
 pyfluidsynth
 tqdm

+--extra-index-url https://download.pytorch.org/whl/cu124
 Pillow
 numpy
+torch
+transformers>=4.36
 gradio==4.43.0
 pyfluidsynth
 tqdm