Spaces:
Paused
Paused
Update app.py
Browse files
app.py
CHANGED
|
@@ -1,64 +1,101 @@
|
|
| 1 |
import gradio as gr
|
| 2 |
-
from
|
|
|
|
| 3 |
|
| 4 |
-
"""
|
| 5 |
-
|
| 6 |
-
"""
|
| 7 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 8 |
|
|
|
|
|
|
|
|
|
|
| 9 |
|
| 10 |
-
def
|
| 11 |
-
|
| 12 |
-
history: list[tuple[str, str]],
|
| 13 |
-
system_message,
|
| 14 |
-
max_tokens,
|
| 15 |
-
temperature,
|
| 16 |
-
top_p,
|
| 17 |
-
):
|
| 18 |
-
messages = [{"role": "system", "content": system_message}]
|
| 19 |
|
| 20 |
-
|
| 21 |
-
|
| 22 |
-
|
| 23 |
-
|
| 24 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 25 |
|
| 26 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 27 |
|
| 28 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 29 |
|
| 30 |
-
|
| 31 |
-
|
| 32 |
-
|
| 33 |
-
|
| 34 |
-
|
| 35 |
-
|
| 36 |
-
|
| 37 |
-
|
|
|
|
|
|
|
| 38 |
|
| 39 |
-
|
| 40 |
-
|
|
|
|
| 41 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 42 |
|
| 43 |
-
|
| 44 |
-
|
| 45 |
-
|
| 46 |
-
|
| 47 |
-
|
| 48 |
-
|
| 49 |
-
gr.Textbox(
|
| 50 |
-
gr.
|
| 51 |
-
gr.
|
| 52 |
-
gr.
|
| 53 |
-
minimum=0.1,
|
| 54 |
-
maximum=1.0,
|
| 55 |
-
value=0.95,
|
| 56 |
-
step=0.05,
|
| 57 |
-
label="Top-p (nucleus sampling)",
|
| 58 |
-
),
|
| 59 |
],
|
|
|
|
|
|
|
| 60 |
)
|
| 61 |
|
| 62 |
-
|
| 63 |
-
if __name__ == "__main__":
|
| 64 |
-
demo.launch()
|
|
|
|
| 1 |
import gradio as gr
|
| 2 |
+
from transformers import AutoTokenizer, AutoModelForCausalLM
|
| 3 |
+
import os
|
| 4 |
|
| 5 |
+
# Language codes that format_input accepts for the source and target language.
LANGUAGES = ["en", "de", "es", "fr", "it", "nl", "sv", "pt"]

# Maps each domain label offered in the UI dropdown to the short code that is
# embedded in the domain control token.
DOMAINS = {
    "Asset management": "am",
    "Annual report": "ar",
    "Corporate action": "corporateAction",
    "Equity research": "equi",
    "Fund fact sheet": "ffs",
    "Kiid": "kiid",
    "Life insurance": "lifeInsurance",
    "Regulatory": "regulatory",
    "General": "general",
}
|
| 17 |
|
| 18 |
+
# Helper functions
|
| 19 |
+
def language_token(lang):
    """Return the language control token for ``lang``, e.g. ``<lang_en>``."""
    return f"<lang_{lang}>"
|
| 21 |
|
| 22 |
+
def domain_token(dom):
    """Return the domain control token for ``dom``, e.g. ``<dom_general>``."""
    return f"<dom_{dom}>"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 24 |
|
| 25 |
+
def format_input(src, tgt_lang, src_lang, domain):
    """Build the prompt string that is fed to the translation model.

    The prompt is ``<eos>{src}</src>`` followed by the target-language token,
    then (optionally) the source-language token and the domain token.

    Args:
        src: Text to translate.
        tgt_lang: Target language code; must be one of ``LANGUAGES``.
        src_lang: Source language code from ``LANGUAGES``, or a falsy value
            to omit the source-language token.
        domain: Domain label (a key of ``DOMAINS``), or a falsy value to omit
            the domain token. Labels that are not in ``DOMAINS`` fall back to
            ``"general"``.

    Returns:
        The formatted prompt string.

    Raises:
        ValueError: If ``tgt_lang`` or a non-empty ``src_lang`` is not in
            ``LANGUAGES``.
    """
    # Explicit checks instead of ``assert`` so validation survives ``python -O``.
    if tgt_lang not in LANGUAGES:
        raise ValueError(f"Unsupported target language: {tgt_lang!r}")

    # Prefix the input with <eos>
    parts = [f"<eos>{src}</src>", language_token(tgt_lang)]

    if src_lang:
        if src_lang not in LANGUAGES:
            raise ValueError(f"Unsupported source language: {src_lang!r}")
        parts.append(language_token(src_lang))

    if domain:
        parts.append(domain_token(DOMAINS.get(domain, "general")))

    return "".join(parts)
|
| 39 |
|
| 40 |
+
# Initialize model and tokenizer globally to avoid reloading
model_id = "LinguaCustodia/multilingual-multidomain-fin-mt-70M"
# Prefer the TOKEN environment variable; when it is unset or empty, `True`
# is passed instead (per the transformers docs this selects the locally
# stored login token).
auth_token = os.environ.get("TOKEN") or True
tokenizer = AutoTokenizer.from_pretrained(model_id, use_auth_token=auth_token)
# The weights come from the same repository as the tokenizer, so they are
# requested with the same credentials.
model = AutoModelForCausalLM.from_pretrained(model_id, use_auth_token=auth_token)
|
| 45 |
|
| 46 |
+
def translate(text, source_lang, target_lang, domain):
    """Translate ``text`` and return the decoded model output.

    Args:
        text: Text to translate. Empty, ``None`` or whitespace-only input
            returns ``""`` without calling the model.
        source_lang: Source language name, looked up in ``language_map``.
            A name that is not in the map results in no source-language
            token being added to the prompt.
        target_lang: Target language name, looked up in ``language_map``.
            A name that is not in the map is rejected by ``format_input``.
        domain: Domain label forwarded unchanged to ``format_input``.

    Returns:
        The translated text with special tokens removed.
    """
    if not text or not text.strip():
        return ""

    src_lang_code = language_map.get(source_lang)
    tgt_lang_code = language_map.get(target_lang)

    formatted_sentence = format_input(text, tgt_lang_code, src_lang_code, domain)
    inputs = tokenizer(formatted_sentence, return_tensors="pt", return_token_type_ids=False)

    outputs = model.generate(**inputs, max_new_tokens=256)

    # The generated sequence starts with the prompt tokens, so decoding begins
    # right after them to keep only the newly produced text.
    input_size = inputs["input_ids"].size(1)
    translated_sentence = tokenizer.decode(
        outputs[0, input_size:], skip_special_tokens=True
    )

    return translated_sentence
|
| 64 |
|
| 65 |
+
# Maps the language names offered in the UI dropdowns to the language codes
# that translate() passes on to format_input().
language_map = {
    "English": "en",
    "German": "de",
    "Spanish": "es",
    "French": "fr",
    "Italian": "it",
    "Dutch": "nl",
    "Swedish": "sv",
    "Portuguese": "pt",
}
|
| 75 |
|
| 76 |
+
# Static text passed to gr.Interface: page title, HTML blurb, and HTML footer.
title = "🌐 Multilingual Multidomain Financial Translator 🌐"
description = """<p><center>Specialized Translation for Financial Documents across 8 Languages and 9 Domains</center></p>"""
article = """<p style='text-align: center'>Model: <a href='https://huggingface.co/LinguaCustodia/multilingual-multidomain-fin-mt-70M' target='_blank'>LinguaCustodia/multilingual-multidomain-fin-mt-70M</a></p>"""

# Example rows in the same order as the interface inputs:
# [input text, source language, target language, financial domain].
examples = [
    ["Nous avons enregistré une croissance du chiffre d'affaires de 5,7% au troisième trimestre.", "French", "English", "Annual report"],
    ["The funds under management increased by €2.3 billion during the fiscal year.", "English", "Spanish", "Asset management"],
    ["Der Aufsichtsrat hat den Jahresabschluss geprüft und genehmigt.", "German", "French", "Regulatory"],
]
|
| 85 |
|
| 86 |
+
# Gradio UI: one text box plus three dropdowns feed translate(); the result is
# shown in a single output text box.
demo = gr.Interface(
    fn=translate,
    title=title,
    description=description,
    article=article,
    inputs=[
        gr.Textbox(lines=5, placeholder="Enter text to translate (maximum 5 lines)", label="Input Text"),
        gr.Dropdown(choices=list(language_map.keys()), value="French", label="Source Language"),
        gr.Dropdown(choices=list(language_map.keys()), value="English", label="Target Language"),
        gr.Dropdown(choices=list(DOMAINS.keys()), value="General", label="Financial Domain"),
    ],
    outputs=gr.Textbox(label="Translation"),
    examples=examples,
)

# `launch(enable_queue=True)` is rejected by current Gradio releases; request
# queuing is enabled through `queue()` instead.
demo.queue()
demo.launch()
|
|
|
|
|
|