fix type annotation and add ngram filtering (#7)
Fix type annotation and add ngram_filtering (412664701c7f9488addf9005fd9a5a81a0d0179c)
Co-authored-by: hysts <hysts@users.noreply.huggingface.co>
app.py CHANGED
```diff
@@ -1,6 +1,7 @@
+from __future__ import annotations
+
 import os
 
-from typing import Union
 import gradio as gr
 import numpy as np
 import torch
@@ -49,14 +50,14 @@ translator = Translator(
 def predict(
     task_name: str,
     audio_source: str,
-    input_audio_mic: Union[str, None],
-    input_audio_file: Union[str, None],
-    input_text: Union[str, None],
-    source_language: Union[str, None],
+    input_audio_mic: str | None,
+    input_audio_file: str | None,
+    input_text: str | None,
+    source_language: str | None,
     target_language: str,
-) -> tuple[str, str]:
+) -> tuple[tuple[int, np.ndarray] | None, str]:
     task_name = task_name.split()[0]
-    source_language_code = LANGUAGE_NAME_TO_CODE[source_language]
+    source_language_code = LANGUAGE_NAME_TO_CODE[source_language] if source_language else None
     target_language_code = LANGUAGE_NAME_TO_CODE[target_language]
 
     if task_name in ["S2ST", "S2TT", "ASR"]:
@@ -79,6 +80,7 @@ def predict(
         task_str=task_name,
         tgt_lang=target_language_code,
         src_lang=source_language_code,
+        ngram_filtering=True,
     )
     if task_name in ["S2ST", "T2ST"]:
         return (sr, wav.cpu().detach().numpy()), text_out
@@ -86,7 +88,7 @@ def predict(
     return None, text_out
 
 
-def process_s2st_example(input_audio_file: str, target_language: str) -> tuple[str, str]:
+def process_s2st_example(input_audio_file: str, target_language: str) -> tuple[tuple[int, np.ndarray] | None, str]:
     return predict(
         task_name="S2ST",
         audio_source="file",
@@ -98,7 +100,7 @@ def process_s2st_example(input_audio_file: str, target_language: str) -> tuple[s
     )
 
 
-def process_s2tt_example(input_audio_file: str, target_language: str) -> tuple[str, str]:
+def process_s2tt_example(input_audio_file: str, target_language: str) -> tuple[tuple[int, np.ndarray] | None, str]:
     return predict(
         task_name="S2TT",
         audio_source="file",
@@ -110,7 +112,9 @@ def process_s2tt_example(input_audio_file: str, target_language: str) -> tuple[s
     )
 
 
-def process_t2st_example(input_text: str, source_language: str, target_language: str) -> tuple[str, str]:
+def process_t2st_example(
+    input_text: str, source_language: str, target_language: str
+) -> tuple[tuple[int, np.ndarray] | None, str]:
     return predict(
         task_name="T2ST",
         audio_source="",
@@ -122,7 +126,9 @@ def process_t2st_example(input_text: str, source_language: str, target_language:
     )
 
 
-def process_t2tt_example(input_text: str, source_language: str, target_language: str) -> tuple[str, str]:
+def process_t2tt_example(
+    input_text: str, source_language: str, target_language: str
+) -> tuple[tuple[int, np.ndarray] | None, str]:
     return predict(
         task_name="T2TT",
         audio_source="",
@@ -134,7 +140,7 @@ def process_t2tt_example(input_text: str, source_language: str, target_language:
     )
 
 
-def process_asr_example(input_audio_file: str, target_language: str) -> tuple[str, str]:
+def process_asr_example(input_audio_file: str, target_language: str) -> tuple[tuple[int, np.ndarray] | None, str]:
     return predict(
         task_name="ASR",
         audio_source="file",
@@ -317,10 +323,16 @@ with gr.Blocks(css="style.css") as demo:
             examples=[
                 ["My favorite animal is the elephant.", "English", "French"],
                 ["My favorite animal is the elephant.", "English", "Mandarin Chinese"],
-                [
-                …
-                …
-                …
+                [
+                    "Meta AI's Seamless M4T model is democratising spoken communication across language barriers",
+                    "English",
+                    "Hindi",
+                ],
+                [
+                    "Meta AI's Seamless M4T model is democratising spoken communication across language barriers",
+                    "English",
+                    "Spanish",
+                ],
             ],
             inputs=[input_text, source_language, target_language],
             outputs=[output_audio, output_text],
@@ -332,10 +344,16 @@ with gr.Blocks(css="style.css") as demo:
             examples=[
                 ["My favorite animal is the elephant.", "English", "French"],
                 ["My favorite animal is the elephant.", "English", "Mandarin Chinese"],
-                [
-                …
-                …
-                …
+                [
+                    "Meta AI's Seamless M4T model is democratising spoken communication across language barriers",
+                    "English",
+                    "Hindi",
+                ],
+                [
+                    "Meta AI's Seamless M4T model is democratising spoken communication across language barriers",
+                    "English",
+                    "Spanish",
+                ],
             ],
             inputs=[input_text, source_language, target_language],
             outputs=[output_audio, output_text],
```
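In short, the commit replaces `typing.Union` annotations with PEP 604 `str | None` syntax (kept safe on older interpreters by `from __future__ import annotations`), corrects the return annotations to match what `predict` actually returns, guards the `LANGUAGE_NAME_TO_CODE` lookup when no source language is selected, and turns on n-gram filtering in the underlying translation call. Below is a minimal sketch of the resulting call pattern, assuming the `Translator` API of the `seamless_communication` package this Space builds on; the model and vocoder names, the device choice, and the three-value return order are assumptions for illustration, not part of the diff:

```python
from __future__ import annotations

import numpy as np
import torch
from seamless_communication.models.inference import Translator

# Assumed setup: model/vocoder names and device are illustrative, not from the diff.
translator = Translator(
    "seamlessM4T_large",
    "vocoder_36langs",
    torch.device("cuda:0" if torch.cuda.is_available() else "cpu"),
)


def run_task(
    task_name: str,
    input_data: str,
    target_language_code: str,
    source_language_code: str | None,  # PEP 604 syntax; valid at runtime via the future import
) -> tuple[tuple[int, np.ndarray] | None, str]:
    # ngram_filtering=True asks the decoder to suppress repeated n-grams in its
    # output -- the behavior change this commit enables for every task.
    # The (text_out, wav, sr) unpacking order is assumed from typical
    # seamless_communication usage.
    text_out, wav, sr = translator.predict(
        input_data,
        task_str=task_name,
        tgt_lang=target_language_code,
        src_lang=source_language_code,
        ngram_filtering=True,
    )
    if task_name in ["S2ST", "T2ST"]:
        # Speech output: (sample_rate, waveform) suits a Gradio Audio component.
        return (sr, wav.cpu().detach().numpy()), str(text_out)
    # Text-only tasks (S2TT, T2TT, ASR) produce no audio.
    return None, str(text_out)
```

The `if source_language else None` guard in the diff exists presumably because the speech-input tasks infer the source language from the audio, so `src_lang` may legitimately be `None` there; only the text-input tasks require a source-language selection.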