Spaces:
Build error
Build error
Commit
·
e16eaa7
1
Parent(s):
b1f485a
app update
Browse files
app.py
CHANGED
|
@@ -1,7 +1,6 @@
|
|
| 1 |
# OCR Translate v0.1
|
| 2 |
# 创建人:曾逸夫
|
| 3 |
# 创建时间:2022-06-14
|
| 4 |
-
# email: zyfiy1314@163.com
|
| 5 |
|
| 6 |
import os
|
| 7 |
|
|
@@ -13,13 +12,6 @@ from transformers import MarianMTModel, MarianTokenizer
|
|
| 13 |
|
| 14 |
nltk.download('punkt')
|
| 15 |
|
| 16 |
-
# ----------- 翻译 -----------
|
| 17 |
-
# https://huggingface.co/Helsinki-NLP/opus-mt-en-zh
|
| 18 |
-
modchoice = "Helsinki-NLP/opus-mt-en-zh" # 模型名称
|
| 19 |
-
|
| 20 |
-
tokenizer = MarianTokenizer.from_pretrained(modchoice) # 分词器
|
| 21 |
-
model = MarianMTModel.from_pretrained(modchoice) # 模型
|
| 22 |
-
|
| 23 |
OCR_TR_DESCRIPTION = '''# OCR Translate v0.1
|
| 24 |
<div id="content_align">基于Tesseract的OCR翻译系统</div>'''
|
| 25 |
|
|
@@ -30,6 +22,17 @@ img_dir = "./data"
|
|
| 30 |
choices = os.popen('tesseract --list-langs').read().split('\n')[1:-1]
|
| 31 |
|
| 32 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 33 |
# tesseract语言列表转pytesseract语言
|
| 34 |
def ocr_lang(lang_list):
|
| 35 |
lang_str = ""
|
|
@@ -66,11 +69,19 @@ def translate(input_text):
|
|
| 66 |
if input_text is None or input_text == "":
|
| 67 |
return "系统提示:没有可翻译的内容!"
|
| 68 |
|
| 69 |
-
|
| 70 |
-
|
| 71 |
-
translate_text = ""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 72 |
|
| 73 |
-
return translate_text
|
| 74 |
|
| 75 |
|
| 76 |
def main():
|
|
|
|
| 1 |
# OCR Translate v0.1
|
| 2 |
# 创建人:曾逸夫
|
| 3 |
# 创建时间:2022-06-14
|
|
|
|
| 4 |
|
| 5 |
import os
|
| 6 |
|
|
|
|
| 12 |
|
| 13 |
nltk.download('punkt')
|
| 14 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 15 |
OCR_TR_DESCRIPTION = '''# OCR Translate v0.1
|
| 16 |
<div id="content_align">基于Tesseract的OCR翻译系统</div>'''
|
| 17 |
|
|
|
|
| 22 |
choices = os.popen('tesseract --list-langs').read().split('\n')[1:-1]
|
| 23 |
|
| 24 |
|
| 25 |
+
# 翻译模型选择
|
| 26 |
+
from functools import lru_cache


# Translation model selection
@lru_cache(maxsize=4)
def model_choice(src="en", trg="zh"):
    """Load the Helsinki-NLP MarianMT tokenizer and model for a language pair.

    The model name is built as ``Helsinki-NLP/opus-mt-{src}-{trg}``
    (for example https://huggingface.co/Helsinki-NLP/opus-mt-en-zh).

    Results are memoized per ``(src, trg)`` pair: ``translate`` calls this
    function on every request, and re-instantiating the tokenizer and model
    each time is needlessly slow. The cache is bounded so that only a few
    language pairs are kept in memory at once.

    Args:
        src: Source language code used in the model name.
        trg: Target language code used in the model name.

    Returns:
        A ``(tokenizer, model)`` tuple.
    """
    model_name = f"Helsinki-NLP/opus-mt-{src}-{trg}"  # model name

    tokenizer = MarianTokenizer.from_pretrained(model_name)  # tokenizer
    model = MarianMTModel.from_pretrained(model_name)  # model

    return tokenizer, model
|
| 34 |
+
|
| 35 |
+
|
| 36 |
# tesseract语言列表转pytesseract语言
|
| 37 |
def ocr_lang(lang_list):
|
| 38 |
lang_str = ""
|
|
|
|
| 69 |
if input_text is None or input_text == "":
|
| 70 |
return "系统提示:没有可翻译的内容!"
|
| 71 |
|
| 72 |
+
tokenizer, model = model_choice()
|
| 73 |
+
|
| 74 |
+
translate_text = ""
|
| 75 |
+
input_text_list = input_text.split("\n\n")
|
| 76 |
+
|
| 77 |
+
for i in range(len(input_text_list)):
|
| 78 |
+
translated_sub = model.generate(
|
| 79 |
+
**tokenizer(sent_tokenize(input_text_list[i]), return_tensors="pt", truncation=True, padding=True))
|
| 80 |
+
tgt_text_sub = [tokenizer.decode(t, skip_special_tokens=True) for t in translated_sub]
|
| 81 |
+
translate_text_sub = "".join(tgt_text_sub)
|
| 82 |
+
translate_text = translate_text + "\n\n" + translate_text_sub
|
| 83 |
|
| 84 |
+
return translate_text[2:]
|
| 85 |
|
| 86 |
|
| 87 |
def main():
|