Spaces:
Running
on
Zero
Running
on
Zero
Commit
·
3740b63
1
Parent(s):
2a62da0
use sentence splitters from stopes
Browse files
Signed-off-by: David Dale <daviddale@meta.com>
- app.py +11 -1
- requirements.txt +1 -0
app.py
CHANGED
|
@@ -1,6 +1,7 @@
|
|
| 1 |
import spaces
|
| 2 |
import gradio as gr
|
| 3 |
from sacremoses import MosesPunctNormalizer
|
|
|
|
| 4 |
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
|
| 5 |
from flores import code_mapping
|
| 6 |
import platform
|
|
@@ -35,6 +36,14 @@ tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
|
|
| 35 |
|
| 36 |
punct_normalizer = MosesPunctNormalizer(lang="en")
|
| 37 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 38 |
# cache function
|
| 39 |
@lru_cache(maxsize=100)
|
| 40 |
def translate(text: str, src_lang: str, tgt_lang: str):
|
|
@@ -60,7 +69,8 @@ def _translate(text: str, src_lang: str, tgt_lang: str):
|
|
| 60 |
translated_paragraphs = []
|
| 61 |
|
| 62 |
for paragraph in paragraphs:
|
| 63 |
-
|
|
|
|
| 64 |
translated_sentences = []
|
| 65 |
|
| 66 |
for sentence in sentences:
|
|
|
|
| 1 |
import spaces
|
| 2 |
import gradio as gr
|
| 3 |
from sacremoses import MosesPunctNormalizer
|
| 4 |
+
from stopes.pipelines.monolingual.utils.sentence_split import get_split_algo
|
| 5 |
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
|
| 6 |
from flores import code_mapping
|
| 7 |
import platform
|
|
|
|
| 36 |
|
| 37 |
punct_normalizer = MosesPunctNormalizer(lang="en")
|
| 38 |
|
| 39 |
+
|
| 40 |
+
@lru_cache(maxsize=202)
|
| 41 |
+
def get_language_specific_sentence_splitter(language_code):
|
| 42 |
+
short_code = language_code[:3]
|
| 43 |
+
splitter = get_split_algo(short_code, "default")
|
| 44 |
+
return splitter
|
| 45 |
+
|
| 46 |
+
|
| 47 |
# cache function
|
| 48 |
@lru_cache(maxsize=100)
|
| 49 |
def translate(text: str, src_lang: str, tgt_lang: str):
|
|
|
|
| 69 |
translated_paragraphs = []
|
| 70 |
|
| 71 |
for paragraph in paragraphs:
|
| 72 |
+
splitter = get_language_specific_sentence_splitter(src_code)
|
| 73 |
+
sentences = list(splitter(paragraph))
|
| 74 |
translated_sentences = []
|
| 75 |
|
| 76 |
for sentence in sentences:
|
requirements.txt
CHANGED
|
@@ -5,3 +5,4 @@ gradio==4.32.2
|
|
| 5 |
spaces
|
| 6 |
nltk
|
| 7 |
sacremoses
|
|
|
|
|
|
| 5 |
spaces
|
| 6 |
nltk
|
| 7 |
sacremoses
|
| 8 |
+
stopes[mono] @ git+https://github.com/facebookresearch/stopes@better-sentence-splitters
|