Spaces:
Build error
Build error
Commit
·
56a498b
1
Parent(s):
13beaa4
añadidos cambios article
Browse files- app.py +2 -12
- article_app.py +37 -13
app.py
CHANGED
|
@@ -36,14 +36,6 @@ models = {
|
|
| 36 |
"IIC/wav2vec2-spanish-multilibrispeech"
|
| 37 |
),
|
| 38 |
},
|
| 39 |
-
# "wav2vec2-jonatangrosman": {
|
| 40 |
-
# "processor": Wav2Vec2Tokenizer.from_pretrained(
|
| 41 |
-
# "jonatasgrosman/wav2vec2-large-xlsr-53-spanish"
|
| 42 |
-
# ),
|
| 43 |
-
# "model": AutoModelForCTC.from_pretrained(
|
| 44 |
-
# "jonatasgrosman/wav2vec2-large-xlsr-53-spanish"
|
| 45 |
-
# ),
|
| 46 |
-
# },
|
| 47 |
}
|
| 48 |
|
| 49 |
|
|
@@ -80,7 +72,7 @@ similarity_model = SentenceTransformer(
|
|
| 80 |
"distiluse-base-multilingual-cased", device="cpu"
|
| 81 |
)
|
| 82 |
|
| 83 |
-
crossencoder = CrossEncoder("
|
| 84 |
|
| 85 |
dataset = load_dataset("IIC/spanish_biomedical_crawled_corpus", split="train")
|
| 86 |
|
|
@@ -228,7 +220,7 @@ if __name__ == "__main__":
|
|
| 228 |
step=1,
|
| 229 |
),
|
| 230 |
gr.inputs.Dropdown(
|
| 231 |
-
["wav2vec2-iic"
|
| 232 |
type="value",
|
| 233 |
default=None,
|
| 234 |
label="Select the speech recognition model.",
|
|
@@ -239,12 +231,10 @@ if __name__ == "__main__":
|
|
| 239 |
],
|
| 240 |
outputs=[
|
| 241 |
gr.outputs.HTML(
|
| 242 |
-
# type="str",
|
| 243 |
label="Answer from the system."
|
| 244 |
),
|
| 245 |
gr.outputs.Audio(label="Answer in audio"),
|
| 246 |
],
|
| 247 |
-
# title="Abstractive QA of BioMedical Domain in Spanish",
|
| 248 |
description=description,
|
| 249 |
examples=examples,
|
| 250 |
theme="grass",
|
|
|
|
| 36 |
"IIC/wav2vec2-spanish-multilibrispeech"
|
| 37 |
),
|
| 38 |
},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 39 |
}
|
| 40 |
|
| 41 |
|
|
|
|
| 72 |
"distiluse-base-multilingual-cased", device="cpu"
|
| 73 |
)
|
| 74 |
|
| 75 |
+
crossencoder = CrossEncoder("IIC/roberta-base-bne-ranker", device="cpu")
|
| 76 |
|
| 77 |
dataset = load_dataset("IIC/spanish_biomedical_crawled_corpus", split="train")
|
| 78 |
|
|
|
|
| 220 |
step=1,
|
| 221 |
),
|
| 222 |
gr.inputs.Dropdown(
|
| 223 |
+
["wav2vec2-iic"],
|
| 224 |
type="value",
|
| 225 |
default=None,
|
| 226 |
label="Select the speech recognition model.",
|
|
|
|
| 231 |
],
|
| 232 |
outputs=[
|
| 233 |
gr.outputs.HTML(
|
|
|
|
| 234 |
label="Answer from the system."
|
| 235 |
),
|
| 236 |
gr.outputs.Audio(label="Answer in audio"),
|
| 237 |
],
|
|
|
|
| 238 |
description=description,
|
| 239 |
examples=examples,
|
| 240 |
theme="grass",
|
article_app.py
CHANGED
|
@@ -9,27 +9,27 @@ have been introduced to build this app.
|
|
| 9 |
The reason for including audio as a possible input and always as an output is because we wanted to make the App much more accessible to people that cannot read or write.
|
| 10 |
Below you can find all the pieces that form the system.
|
| 11 |
|
| 12 |
-
1. <a href="https://
|
| 13 |
-
2. <a href="https://
|
| 14 |
that is, the task of getting the most relevant passages to answer a given question with. You can find details about how it was trained on the link attached to the name.
|
| 15 |
-
3. <a href="https://
|
| 16 |
-
4. <a href="https://
|
| 17 |
-
5. <a href="https://
|
| 18 |
passages and uses them to generate an answer to the question. In the attached link there are more details about how we trained it etc.
|
| 19 |
|
| 20 |
On the other hand, we uploaded, and in some cases created, datasets in Spanish to be able to build such a system.
|
| 21 |
|
| 22 |
-
1. <a href="https://
|
| 23 |
-
2. <a href="https://
|
| 24 |
-
3. <a href="https://
|
| 25 |
-
4. <a href="https://
|
| 26 |
-
5. <a href="https://
|
| 27 |
</p>
|
| 28 |
"""
|
| 29 |
-
|
| 30 |
description = """
|
| 31 |
<a href="https://www.iic.uam.es/">
|
| 32 |
-
<img src="https://drive.google.com/uc?export=view&id=
|
| 33 |
</a>
|
| 34 |
<h1> BioMedIA: Abstractive Question Answering of BioMedical Domain in Spanish </h1>
|
| 35 |
Esta aplicación consiste en sistemas de búsqueda del Estado del Arte en Español junto con un modelo generativo entrenado para componer una respuesta a preguntas a partir de una serie de contextos.
|
|
@@ -49,6 +49,30 @@ examples = [
|
|
| 49 |
"wav2vec2-iic",
|
| 50 |
False,
|
| 51 |
],
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 52 |
[
|
| 53 |
"¿Qué alternativas al Paracetamol existen para el dolor de cabeza?",
|
| 54 |
"vacio.flac",
|
|
@@ -98,7 +122,7 @@ examples = [
|
|
| 98 |
False
|
| 99 |
],
|
| 100 |
[
|
| 101 |
-
"¿Qué deficiencia es la causa del síndrome de piernas inquietas
|
| 102 |
"vacio.flac",
|
| 103 |
"vacio.flac",
|
| 104 |
50,
|
|
|
|
| 9 |
The reason for including audio as a possible input and always as an output is because we wanted to make the App much more accessible to people that cannot read or write.
|
| 10 |
Below you can find all the pieces that form the system.
|
| 11 |
|
| 12 |
+
1. <a href="https://hf.co/IIC/wav2vec2-spanish-multilibrispeech">Speech2Text</a>: For this we fine-tuned a multilingual Wav2Vec2, as explained in the attached link. We use this model to process audio questions.
|
| 13 |
+
2. <a href="https://hf.co/IIC/dpr-spanish-passage_encoder-allqa-base">Dense Passage Retrieval for Context</a>: Dense Passage Retrieval is a methodology <a href="https://arxiv.org/abs/2004.04906">developed by Facebook</a> which is currently the SoTA for Passage Retrieval,
|
| 14 |
that is, the task of getting the most relevant passages to answer a given question with. You can find details about how it was trained on the link attached to the name.
|
| 15 |
+
3. <a href="https://hf.co/IIC/dpr-spanish-question_encoder-allqa-base">Dense Passage Retrieval for Question</a>: It is actually part of the same thing as the above. For more details, go to the attached link.
|
| 16 |
+
4. <a href="https://hf.co/sentence-transformers/distiluse-base-multilingual-cased-v1">Sentence Encoder Ranker</a>: To rerank the candidate contexts retrieved by DPR for the generative model to see. This also selects the top 5 passages for the model to read; it is the final filter before the generative model.
|
| 17 |
+
5. <a href="https://hf.co/IIC/mt5-base-lfqa-es">Generative Long-Form Question Answering Model</a>: For this we used either mT5 (the one attached) or <a href="https://hf.co/IIC/mbart-large-lfqa-es">mBART</a>. This generative model receives the most relevant
|
| 18 |
passages and uses them to generate an answer to the question. In the attached link there are more details about how we trained it etc.
|
| 19 |
|
| 20 |
On the other hand, we uploaded, and in some cases created, datasets in Spanish to be able to build such a system.
|
| 21 |
|
| 22 |
+
1. <a href="https://hf.co/datasets/IIC/spanish_biomedical_crawled_corpus">Spanish Biomedical Crawled Corpus</a>. Used for finding answers to questions about biomedicine. (More info in the link.)
|
| 23 |
+
2. <a href="https://hf.co/datasets/IIC/lfqa_spanish">LFQA_Spanish</a>. Used for training the generative model. (More info in the link.)
|
| 24 |
+
3. <a href="https://hf.co/datasets/squad_es">SQUADES</a>. Used to train the DPR models. (More info in the link.)
|
| 25 |
+
4. <a href="https://hf.co/datasets/IIC/bioasq22_es">BioAsq22-Spanish</a>. Used to train the DPR models. (More info in the link.)
|
| 26 |
+
5. <a href="https://hf.co/datasets/PlanTL-GOB-ES/SQAC">SQAC (Spanish Question Answering Corpus)</a>. Used to train the DPR models. (More info in the link.)
|
| 27 |
</p>
|
| 28 |
"""
|
| 29 |
+
|
| 30 |
description = """
|
| 31 |
<a href="https://www.iic.uam.es/">
|
| 32 |
+
<img src="https://drive.google.com/uc?export=view&id=1kvHDFUPPnf1kM5EKlv5Ife2KcZZvva_1" style="max-width: 100%; max-height: 10%; height: 250px; object-fit: fill">
|
| 33 |
</a>
|
| 34 |
<h1> BioMedIA: Abstractive Question Answering of BioMedical Domain in Spanish </h1>
|
| 35 |
Esta aplicación consiste en sistemas de búsqueda del Estado del Arte en Español junto con un modelo generativo entrenado para componer una respuesta a preguntas a partir de una serie de contextos.
|
|
|
|
| 49 |
"wav2vec2-iic",
|
| 50 |
False,
|
| 51 |
],
|
| 52 |
+
[
|
| 53 |
+
"¿Por qué sentimos ansiedad?",
|
| 54 |
+
"vacio.flac",
|
| 55 |
+
"vacio.flac",
|
| 56 |
+
50,
|
| 57 |
+
8,
|
| 58 |
+
3,
|
| 59 |
+
1.0,
|
| 60 |
+
250,
|
| 61 |
+
"wav2vec2-iic",
|
| 62 |
+
False,
|
| 63 |
+
],
|
| 64 |
+
[
|
| 65 |
+
"¿Qué es la mesoterapia?",
|
| 66 |
+
"vacio.flac",
|
| 67 |
+
"vacio.flac",
|
| 68 |
+
50,
|
| 69 |
+
8,
|
| 70 |
+
3,
|
| 71 |
+
1.0,
|
| 72 |
+
250,
|
| 73 |
+
"wav2vec2-iic",
|
| 74 |
+
False,
|
| 75 |
+
],
|
| 76 |
[
|
| 77 |
"¿Qué alternativas al Paracetamol existen para el dolor de cabeza?",
|
| 78 |
"vacio.flac",
|
|
|
|
| 122 |
False
|
| 123 |
],
|
| 124 |
[
|
| 125 |
+
"¿Qué deficiencia es la causa del síndrome de piernas inquietas?",
|
| 126 |
"vacio.flac",
|
| 127 |
"vacio.flac",
|
| 128 |
50,
|