multi_parler_tts

Running on Zero

App Files Files Community

PHBJT committed on Oct 30, 2024

Commit

03d612a

verified ·

1 Parent(s): d7d8798

Update app.py

Browse files

Files changed (1) hide show

app.py +19 -12

app.py CHANGED Viewed

@@ -11,8 +11,8 @@ from transformers import AutoTokenizer, AutoFeatureExtractor, set_seed
 # Device setup
 device = "cuda:0" if torch.cuda.is_available() else "cpu"
-# SmolLM setup
-checkpoint = "HuggingFaceTB/SmolLM-360M"
 smol_tokenizer = AutoTokenizer.from_pretrained(checkpoint)
 smol_model = AutoModelForCausalLM.from_pretrained(checkpoint, device_map="auto", torch_dtype=torch.bfloat16)
@@ -49,7 +49,9 @@ def format_description(raw_description, do_format=True):
     if not do_format:
         return raw_description
-    prompt = f"""Format this voice description to match exactly:
 "a [gender] with a [pitch] voice speaks [speed] in a [environment], [delivery style]"
 Where:
 - gender: man/woman
@@ -57,21 +59,25 @@ Where:
 - speed: slowly/moderately/quickly
 - environment: close-sounding and clear/distant-sounding and noisy
 - delivery style: with monotone delivery/with animated delivery
-Description to format: {raw_description}
-Formatted description:"""
-    inputs = smol_tokenizer.encode(prompt, return_tensors="pt").to(device)
     outputs = smol_model.generate(
         inputs,
-        max_length=200,
-        num_return_sequences=1,
         temperature=0.7,
-        do_sample=True,
-        pad_token_id=smol_tokenizer.eos_token_id
     )
     formatted = smol_tokenizer.decode(outputs[0], skip_special_tokens=True)
-    return formatted.split("Formatted description:")[-1].strip()
 def preprocess(text):
     text = number_normalizer(text).strip()
@@ -109,6 +115,7 @@ def gen_tts(text, description, do_format=True):
     audio_arr = generation.cpu().numpy().squeeze()
     return formatted_desc, (SAMPLE_RATE, audio_arr)
 css = """
         #share-btn-container {
             display: flex;

 # Device setup
 device = "cuda:0" if torch.cuda.is_available() else "cpu"
+# SmolLM Instruct setup
+checkpoint = "HuggingFaceTB/SmolLM-360M-Instruct"
 smol_tokenizer = AutoTokenizer.from_pretrained(checkpoint)
 smol_model = AutoModelForCausalLM.from_pretrained(checkpoint, device_map="auto", torch_dtype=torch.bfloat16)
     if not do_format:
         return raw_description
+    messages = [{
+        "role": "user",
+        "content": f"""Format this voice description to match exactly:
 "a [gender] with a [pitch] voice speaks [speed] in a [environment], [delivery style]"
 Where:
 - gender: man/woman
 - speed: slowly/moderately/quickly
 - environment: close-sounding and clear/distant-sounding and noisy
 - delivery style: with monotone delivery/with animated delivery
+Description to format: {raw_description}"""
+    }]
+    input_text = smol_tokenizer.apply_chat_template(messages, tokenize=False)
+    inputs = smol_tokenizer.encode(input_text, return_tensors="pt").to(device)
     outputs = smol_model.generate(
         inputs,
+        max_new_tokens=200,
         temperature=0.7,
+        top_p=0.9,
+        do_sample=True
     )
     formatted = smol_tokenizer.decode(outputs[0], skip_special_tokens=True)
+    # Extract the formatted description from the response
+    try:
+        return formatted.split("a ")[-1].strip()
+    except:
+        return raw_description
 def preprocess(text):
     text = number_normalizer(text).strip()
     audio_arr = generation.cpu().numpy().squeeze()
     return formatted_desc, (SAMPLE_RATE, audio_arr)
+# Rest of the code remains unchanged
 css = """
         #share-btn-container {
             display: flex;