multi_parler_tts

Running on Zero

App Files Files Community

PHBJT committed on Oct 30, 2024

Commit

80cff80

verified ·

1 Parent(s): 44bb8b9

Update app.py

Browse files

Files changed (1) hide show

app.py +18 -26

app.py CHANGED Viewed

@@ -11,10 +11,13 @@ from transformers import AutoTokenizer, AutoFeatureExtractor, set_seed
 # Device setup
 device = "cuda:0" if torch.cuda.is_available() else "cpu"
-# SmolLM Instruct setup
-checkpoint = "HuggingFaceTB/SmolLM-360M-Instruct"
-smol_tokenizer = AutoTokenizer.from_pretrained(checkpoint)
-smol_model = AutoModelForCausalLM.from_pretrained(checkpoint, device_map="auto", torch_dtype=torch.bfloat16)
 # Original model setup
 repo_id = "ylacombe/p-m-e"
@@ -50,39 +53,28 @@ def format_description(raw_description, do_format=True):
         return raw_description
     messages = [{
-        "role": "system",
-        "content": "You are a helpful assistant that formats voice descriptions precisely according to the template provided."
-    }, {
-        "role": "user",
         "content": f"""Format this voice description exactly as:
 "a [gender] with a [pitch] voice speaks [speed] in a [environment], [delivery style]"
 Required format:
-- gender: man/woman
-- pitch: slightly low-pitched/moderate pitch/high-pitched
-- speed: slowly/moderately/quickly
-- environment: close-sounding and clear/distant-sounding and noisy
-- delivery style: with monotone delivery/with animated delivery
-Input description: {raw_description}
 Return only the formatted description, nothing else."""
     }]
-    input_text = smol_tokenizer.apply_chat_template(messages, tokenize=False)
-    inputs = smol_tokenizer.encode(input_text, return_tensors="pt").to(device)
-    outputs = smol_model.generate(
-        inputs,
-        max_new_tokens=100,
-        temperature=0.2,
-        top_p=0.9,
-        do_sample=True
-    )
-    formatted = smol_tokenizer.decode(outputs[0], skip_special_tokens=True)
-    # Extract just the formatted description
     if "a woman" in formatted.lower() or "a man" in formatted.lower():
-        return formatted.strip()
     return raw_description
 def preprocess(text):

 # Device setup
 device = "cuda:0" if torch.cuda.is_available() else "cpu"
+# Gemma setup
+pipe = pipeline(
+    "text-generation",
+    model="google/gemma-2-2b-it",
+    model_kwargs={"torch_dtype": torch.bfloat16},
+    device=device
+)
 # Original model setup
 repo_id = "ylacombe/p-m-e"
         return raw_description
     messages = [{
+        "role": "user",
         "content": f"""Format this voice description exactly as:
 "a [gender] with a [pitch] voice speaks [speed] in a [environment], [delivery style]"
 Required format:
+- gender must be: man/woman
+- pitch must be: slightly low-pitched/moderate pitch/high-pitched
+- speed must be: slowly/moderately/quickly
+- environment must be: close-sounding and clear/distant-sounding and noisy
+- delivery style must be: with monotone delivery/with animated delivery
+Input: {raw_description}
 Return only the formatted description, nothing else."""
     }]
+    outputs = pipe(messages, max_new_tokens=100)
+    formatted = outputs[0]["generated_text"][-1]["content"].strip()
+    # Validate and extract formatted description
     if "a woman" in formatted.lower() or "a man" in formatted.lower():
+        return formatted
     return raw_description
 def preprocess(text):