mlabonne committed on
Commit
a3c9a3f
·
verified ·
1 Parent(s): 37a3be7

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +25 -114
app.py CHANGED
@@ -1,15 +1,13 @@
1
  import subprocess
2
  import sys
3
 
4
- # Install specific versions at runtime
5
- print("Installing dependencies...")
6
  subprocess.check_call([sys.executable, "-m", "pip", "install", "-U", "transformers==4.56.2"])
7
- print("Dependencies installed successfully!")
8
 
9
  import logging
10
  from typing import List, Dict, Tuple
11
  import gradio as gr
12
  from pylate import indexes, models, retrieve
 
13
 
14
  # Configure logging
15
  logging.basicConfig(
@@ -19,7 +17,6 @@ logging.basicConfig(
19
  logger = logging.getLogger(__name__)
20
 
21
 
22
-
23
  class CrossLingualRetriever:
24
  """Cross-lingual retrieval system using LiquidAI's LFM2-ColBERT model."""
25
 
@@ -103,101 +100,6 @@ class CrossLingualRetriever:
103
  return results
104
 
105
 
106
- # Multilingual document corpus
107
- MULTILINGUAL_DOCUMENTS = [
108
- {
109
- "id": "en_1",
110
- "language": "English",
111
- "title": "Artificial Intelligence Overview",
112
- "text": "Artificial intelligence is the simulation of human intelligence processes by machines, especially computer systems. These processes include learning, reasoning, and self-correction."
113
- },
114
- {
115
- "id": "es_1",
116
- "language": "Spanish",
117
- "title": "Inteligencia Artificial",
118
- "text": "La inteligencia artificial es la simulación de procesos de inteligencia humana por parte de máquinas, especialmente sistemas informáticos. Estos procesos incluyen el aprendizaje, el razonamiento y la autocorrección."
119
- },
120
- {
121
- "id": "fr_1",
122
- "language": "French",
123
- "title": "Intelligence Artificielle",
124
- "text": "L'intelligence artificielle est la simulation des processus d'intelligence humaine par des machines, en particulier des systèmes informatiques. Ces processus comprennent l'apprentissage, le raisonnement et l'autocorrection."
125
- },
126
- {
127
- "id": "de_1",
128
- "language": "German",
129
- "title": "Künstliche Intelligenz",
130
- "text": "Künstliche Intelligenz ist die Simulation menschlicher Intelligenzprozesse durch Maschinen, insbesondere Computersysteme. Diese Prozesse umfassen Lernen, Argumentieren und Selbstkorrektur."
131
- },
132
- {
133
- "id": "en_2",
134
- "language": "English",
135
- "title": "Climate Change Impact",
136
- "text": "Climate change refers to long-term shifts in global temperatures and weather patterns. These shifts may be natural, but since the 1800s, human activities have been the main driver of climate change."
137
- },
138
- {
139
- "id": "es_2",
140
- "language": "Spanish",
141
- "title": "Cambio Climático",
142
- "text": "El cambio climático se refiere a cambios a largo plazo en las temperaturas globales y los patrones climáticos. Estos cambios pueden ser naturales, pero desde el siglo XIX, las actividades humanas han sido el principal impulsor del cambio climático."
143
- },
144
- {
145
- "id": "fr_2",
146
- "language": "French",
147
- "title": "Changement Climatique",
148
- "text": "Le changement climatique fait référence aux changements à long terme des températures mondiales et des conditions météorologiques. Ces changements peuvent être naturels, mais depuis les années 1800, les activités humaines sont le principal moteur du changement climatique."
149
- },
150
- {
151
- "id": "zh_1",
152
- "language": "Chinese",
153
- "title": "人工智能",
154
- "text": "人工智能是机器(尤其是计算机系统)对人类智能过程的模拟。这些过程包括学习、推理和自我纠正。"
155
- },
156
- {
157
- "id": "ja_1",
158
- "language": "Japanese",
159
- "title": "人工知能",
160
- "text": "人工知能とは、機械、特にコンピュータシステムによる人間の知能プロセスのシミュレーションです。これらのプロセスには、学習、推論、自己修正が含まれます。"
161
- },
162
- {
163
- "id": "ar_1",
164
- "language": "Arabic",
165
- "title": "الذكاء الاصطناعي",
166
- "text": "الذكاء الاصطناعي هو محاكاة عمليات الذكاء البشري بواسطة الآلات، وخاصة أنظمة الكمبيوتر. تشمل هذه العمليات التعلم والاستدلال والتصحيح الذاتي."
167
- },
168
- {
169
- "id": "en_3",
170
- "language": "English",
171
- "title": "Renewable Energy Sources",
172
- "text": "Renewable energy comes from natural sources that are constantly replenished, such as sunlight, wind, rain, tides, waves, and geothermal heat. These sources are sustainable and environmentally friendly."
173
- },
174
- {
175
- "id": "de_2",
176
- "language": "German",
177
- "title": "Erneuerbare Energien",
178
- "text": "Erneuerbare Energie stammt aus natürlichen Quellen, die ständig nachgefüllt werden, wie Sonnenlicht, Wind, Regen, Gezeiten, Wellen und geothermische Wärme. Diese Quellen sind nachhaltig und umweltfreundlich."
179
- },
180
- {
181
- "id": "pt_1",
182
- "language": "Portuguese",
183
- "title": "Energia Renovável",
184
- "text": "A energia renovável vem de fontes naturais que são constantemente reabastecidas, como luz solar, vento, chuva, marés, ondas e calor geotérmico. Essas fontes são sustentáveis e ambientalmente amigáveis."
185
- },
186
- {
187
- "id": "it_1",
188
- "language": "Italian",
189
- "title": "Energia Rinnovabile",
190
- "text": "L'energia rinnovabile proviene da fonti naturali che vengono costantemente reintegrate, come la luce solare, il vento, la pioggia, le maree, le onde e il calore geotermico. Queste fonti sono sostenibili ed ecologiche."
191
- },
192
- {
193
- "id": "ru_1",
194
- "language": "Russian",
195
- "title": "Искусственный Интеллект",
196
- "text": "Искусственный интеллект - это имитация процессов человеческого интеллекта машинами, особенно компьютерными системами. Эти процессы включают обучение, рассуждение и самокоррекцию."
197
- },
198
- ]
199
-
200
-
201
  # Initialize retriever and load documents
202
  retriever = CrossLingualRetriever()
203
  retriever.load_documents(MULTILINGUAL_DOCUMENTS)
@@ -261,9 +163,11 @@ def search_documents(query: str, top_k: int) -> Tuple[str, str]:
261
  EXAMPLE_QUERIES = [
262
  ["What is artificial intelligence?", 5],
263
  ["¿Qué es el cambio climático?", 5],
264
- ["Qu'est-ce que l'énergie renouvelable?", 5],
265
- ["人工知能とは何ですか?", 5],
266
- ["Was ist künstliche Intelligenz?", 3],
 
 
267
  ]
268
 
269
 
@@ -272,12 +176,15 @@ with gr.Blocks(title="Cross-Lingual Retrieval Demo", theme=gr.themes.Soft()) as
272
  gr.Markdown(
273
  """
274
  # 🌍 Cross-Lingual Document Retrieval
275
- ### Powered by LiquidAI/LFM2-ColBERT-350M
 
 
276
 
277
- This demo showcases **cross-lingual retrieval** - search for documents in any language using queries in any language!
278
  The model finds semantically similar documents regardless of the language mismatch.
279
 
280
- Try searching in English, Spanish, French, German, Chinese, Japanese, Arabic, or any other language!
 
 
281
  """
282
  )
283
 
@@ -302,14 +209,20 @@ with gr.Blocks(title="Cross-Lingual Retrieval Demo", theme=gr.themes.Soft()) as
302
  with gr.Column(scale=1):
303
  gr.Markdown(
304
  """
305
- ### 📚 Available Documents
306
 
307
- The corpus contains documents about:
308
- - **Artificial Intelligence**
309
- - **Climate Change**
310
- - **Renewable Energy**
 
 
 
 
 
311
 
312
- In languages: 🇬🇧 🇪🇸 🇫🇷 🇩🇪 🇨🇳 🇯🇵 🇸🇦 🇵🇹 🇮🇹 🇷🇺
 
313
  """
314
  )
315
 
@@ -351,9 +264,7 @@ with gr.Blocks(title="Cross-Lingual Retrieval Demo", theme=gr.themes.Soft()) as
351
  ---
352
  **How it works:** This demo uses the LiquidAI LFM2-ColBERT-350M model with late interaction retrieval.
353
  The model encodes both queries and documents into token-level embeddings, enabling fine-grained matching
354
- across languages with impressive speed and accuracy.
355
-
356
- Built with [PyLate](https://github.com/lightonai/pylate) and [Gradio](https://gradio.app).
357
  """
358
  )
359
 
 
1
  import subprocess
2
  import sys
3
 
 
 
4
  subprocess.check_call([sys.executable, "-m", "pip", "install", "-U", "transformers==4.56.2"])
 
5
 
6
  import logging
7
  from typing import List, Dict, Tuple
8
  import gradio as gr
9
  from pylate import indexes, models, retrieve
10
+ from documents import MULTILINGUAL_DOCUMENTS
11
 
12
  # Configure logging
13
  logging.basicConfig(
 
17
  logger = logging.getLogger(__name__)
18
 
19
 
 
20
  class CrossLingualRetriever:
21
  """Cross-lingual retrieval system using LiquidAI's LFM2-ColBERT model."""
22
 
 
100
  return results
101
 
102
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
103
  # Initialize retriever and load documents
104
  retriever = CrossLingualRetriever()
105
  retriever.load_documents(MULTILINGUAL_DOCUMENTS)
 
163
  EXAMPLE_QUERIES = [
164
  ["What is artificial intelligence?", 5],
165
  ["¿Qué es el cambio climático?", 5],
166
+ ["양자 컴퓨팅이란 무엇인가요?", 5],
167
+ ["ما هي الصحة النفسية؟", 5],
168
+ ["量子计算是什么?", 5],
169
+ ["Qu'est-ce que l'économie numérique?", 5],
170
+ ["宇宙探査について教えてください", 5],
171
  ]
172
 
173
 
 
176
  gr.Markdown(
177
  """
178
  # 🌍 Cross-Lingual Document Retrieval
179
+ ### Powered by [LiquidAI/LFM2-ColBERT-350M](https://huggingface.co/LiquidAI/LFM2-ColBERT-350M)
180
+
181
+ Search for documents in any language using queries in any language!
182
 
 
183
  The model finds semantically similar documents regardless of the language mismatch.
184
 
185
+ **Supported Languages:** English, Arabic, Chinese, French, German, Japanese, Korean, and Spanish
186
+
187
+ **Topics:** AI, Climate, Energy, Health, Business, Education, Culture, Space Exploration, and more!
188
  """
189
  )
190
 
 
209
  with gr.Column(scale=1):
210
  gr.Markdown(
211
  """
212
+ ### 📚 Document Corpus
213
 
214
+ **30+ documents** covering:
215
+ - 🤖 **Technology**: AI, Quantum Computing
216
+ - 🌍 **Environment**: Climate, Biodiversity
217
+ - **Energy**: Renewable Sources
218
+ - 🏥 **Health**: Medicine, Mental Wellness
219
+ - 💼 **Business**: Digital Economy, Startups
220
+ - 📖 **Education**: Online Learning
221
+ - 🎭 **Culture**: Global Connectivity
222
+ - 🚀 **Science**: Space Exploration
223
 
224
+ **Languages:**
225
+ 🇬🇧 🇪🇸 🇫🇷 🇩🇪 🇨🇳 🇯🇵 🇸🇦 🇰🇷
226
  """
227
  )
228
 
 
264
  ---
265
  **How it works:** This demo uses the LiquidAI LFM2-ColBERT-350M model with late interaction retrieval.
266
  The model encodes both queries and documents into token-level embeddings, enabling fine-grained matching
267
+ across languages with high speed and accuracy.
 
 
268
  """
269
  )
270