mlabonne committed on
Commit
0d1d7d7
·
verified ·
1 Parent(s): da458bf

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +353 -0
app.py ADDED
@@ -0,0 +1,353 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import logging
2
+ from typing import List, Dict, Tuple
3
+ import gradio as gr
4
+ from pylate import indexes, models, retrieve
5
+
6
# Configure logging
# Root logger: INFO and above, each record prefixed with timestamp, logger
# name and severity.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s - %(name)s - %(levelname)s - %(message)s"
)
# Module-level logger used by the retriever class and the search handler below.
logger = logging.getLogger(__name__)
12
+
13
+
14
class CrossLingualRetriever:
    """Cross-lingual retrieval system using LiquidAI's LFM2-ColBERT model.

    Bundles a PyLate ColBERT encoder, a PLAID index and a ColBERT retriever,
    and keeps the loaded document records so that retrieval hits (read here by
    their ``"id"`` and ``"score"``) can be expanded into displayable results.
    """

    def __init__(self, model_name: str = "LiquidAI/LFM2-ColBERT-350M-RC"):
        """Initialize the retriever with model and index.

        Args:
            model_name: Model name or path passed to ``models.ColBERT``.
        """
        logger.info("Loading model: %s", model_name)

        self.model = models.ColBERT(model_name_or_path=model_name)

        # Set padding token if not present: fall back to the EOS token.
        tokenizer = self.model.tokenizer
        if tokenizer.pad_token is None and hasattr(tokenizer, "eos_token"):
            tokenizer.pad_token = tokenizer.eos_token

        # Initialize PLAID index
        # NOTE(review): override=True presumably discards an index left over
        # from a previous run -- confirm against the PyLate documentation.
        self.index = indexes.PLAID(
            index_folder="pylate-index",
            index_name="cross_lingual_index",
            override=True,
        )

        self.retriever = retrieve.ColBERT(index=self.index)

        # All document records loaded so far (kept as a public list).
        self.documents_data = []
        # id -> document record, so search() resolves each hit with one lookup
        # instead of scanning documents_data.
        self._documents_by_id = {}

        logger.info("Model and index initialized successfully")

    def load_documents(self, documents: List[Dict[str, str]]) -> None:
        """Load and index multilingual documents.

        Args:
            documents: Records with at least ``"id"`` and ``"text"`` keys;
                ``search`` additionally reads ``"language"`` and ``"title"``.
                An empty list is a no-op.
        """
        logger.info("Loading %d documents", len(documents))

        if not documents:
            # Nothing to encode or index; leave the existing state untouched.
            logger.warning("No documents provided; index left unchanged")
            return

        documents_ids = [doc["id"] for doc in documents]
        documents_text = [doc["text"] for doc in documents]

        # Encode documents
        documents_embeddings = self.model.encode(
            documents_text,
            batch_size=32,
            is_query=False,
            show_progress_bar=True,
        )

        # Add to index
        self.index.add_documents(
            documents_ids=documents_ids,
            documents_embeddings=documents_embeddings,
        )

        # Merge rather than replace: this method only ever adds to the index,
        # so metadata from earlier calls must stay resolvable in search().
        # Recorded only after indexing succeeded.
        self._documents_by_id.update(zip(documents_ids, documents))
        self.documents_data = list(self._documents_by_id.values())

        logger.info("Documents indexed successfully")

    def search(self, query: str, k: int = 5) -> List[Dict]:
        """Perform cross-lingual search.

        Args:
            query: Free-text query.
            k: Number of hits requested from the retriever.

        Returns:
            One dict per hit whose id is known to this instance, with keys
            ``"id"``, ``"score"`` (rounded to 4 decimals), ``"text"``,
            ``"language"`` and ``"title"``, in the order the retriever
            returned them. Hits with an unknown id are skipped.
        """
        logger.info("Searching for: %s", query)

        # Encode query
        query_embedding = self.model.encode(
            [query],
            batch_size=32,
            is_query=True,
            show_progress_bar=False,
        )

        # Retrieve results
        scores = self.retriever.retrieve(
            queries_embeddings=query_embedding,
            k=k,
        )

        # Format results. A single query was encoded, so only the first
        # entry of the retriever output is read.
        results = []
        for hit in scores[0]:
            doc = self._documents_by_id.get(hit["id"])
            if doc is None:
                continue
            results.append({
                "id": hit["id"],
                "score": round(hit["score"], 4),
                "text": doc["text"],
                "language": doc["language"],
                "title": doc["title"],
            })

        return results
95
+
96
+
97
# Multilingual document corpus
# Each record has a unique "id", a "language" label, a "title" and the "text"
# that gets encoded. The texts cover three topics (artificial intelligence,
# climate change, renewable energy) in several languages.
MULTILINGUAL_DOCUMENTS: List[Dict[str, str]] = [
    {
        "id": "en_1",
        "language": "English",
        "title": "Artificial Intelligence Overview",
        "text": "Artificial intelligence is the simulation of human intelligence processes by machines, especially computer systems. These processes include learning, reasoning, and self-correction."
    },
    {
        "id": "es_1",
        "language": "Spanish",
        "title": "Inteligencia Artificial",
        "text": "La inteligencia artificial es la simulación de procesos de inteligencia humana por parte de máquinas, especialmente sistemas informáticos. Estos procesos incluyen el aprendizaje, el razonamiento y la autocorrección."
    },
    {
        "id": "fr_1",
        "language": "French",
        "title": "Intelligence Artificielle",
        "text": "L'intelligence artificielle est la simulation des processus d'intelligence humaine par des machines, en particulier des systèmes informatiques. Ces processus comprennent l'apprentissage, le raisonnement et l'autocorrection."
    },
    {
        "id": "de_1",
        "language": "German",
        "title": "Künstliche Intelligenz",
        "text": "Künstliche Intelligenz ist die Simulation menschlicher Intelligenzprozesse durch Maschinen, insbesondere Computersysteme. Diese Prozesse umfassen Lernen, Argumentieren und Selbstkorrektur."
    },
    {
        "id": "en_2",
        "language": "English",
        "title": "Climate Change Impact",
        "text": "Climate change refers to long-term shifts in global temperatures and weather patterns. These shifts may be natural, but since the 1800s, human activities have been the main driver of climate change."
    },
    {
        "id": "es_2",
        "language": "Spanish",
        "title": "Cambio Climático",
        "text": "El cambio climático se refiere a cambios a largo plazo en las temperaturas globales y los patrones climáticos. Estos cambios pueden ser naturales, pero desde el siglo XIX, las actividades humanas han sido el principal impulsor del cambio climático."
    },
    {
        "id": "fr_2",
        "language": "French",
        "title": "Changement Climatique",
        "text": "Le changement climatique fait référence aux changements à long terme des températures mondiales et des conditions météorologiques. Ces changements peuvent être naturels, mais depuis les années 1800, les activités humaines sont le principal moteur du changement climatique."
    },
    {
        "id": "zh_1",
        "language": "Chinese",
        "title": "人工智能",
        "text": "人工智能是机器(尤其是计算机系统)对人类智能过程的模拟。这些过程包括学习、推理和自我纠正。"
    },
    {
        "id": "ja_1",
        "language": "Japanese",
        "title": "人工知能",
        "text": "人工知能とは、機械、特にコンピュータシステムによる人間の知能プロセスのシミュレーションです。これらのプロセスには、学習、推論、自己修正が含まれます。"
    },
    {
        "id": "ar_1",
        "language": "Arabic",
        "title": "الذكاء الاصطناعي",
        "text": "الذكاء الاصطناعي هو محاكاة عمليات الذكاء البشري بواسطة الآلات، وخاصة أنظمة الكمبيوتر. تشمل هذه العمليات التعلم والاستدلال والتصحيح الذاتي."
    },
    {
        "id": "en_3",
        "language": "English",
        "title": "Renewable Energy Sources",
        "text": "Renewable energy comes from natural sources that are constantly replenished, such as sunlight, wind, rain, tides, waves, and geothermal heat. These sources are sustainable and environmentally friendly."
    },
    {
        "id": "de_2",
        "language": "German",
        "title": "Erneuerbare Energien",
        "text": "Erneuerbare Energie stammt aus natürlichen Quellen, die ständig nachgefüllt werden, wie Sonnenlicht, Wind, Regen, Gezeiten, Wellen und geothermische Wärme. Diese Quellen sind nachhaltig und umweltfreundlich."
    },
    {
        "id": "pt_1",
        "language": "Portuguese",
        "title": "Energia Renovável",
        "text": "A energia renovável vem de fontes naturais que são constantemente reabastecidas, como luz solar, vento, chuva, marés, ondas e calor geotérmico. Essas fontes são sustentáveis e ambientalmente amigáveis."
    },
    {
        "id": "it_1",
        "language": "Italian",
        "title": "Energia Rinnovabile",
        "text": "L'energia rinnovabile proviene da fonti naturali che vengono costantemente reintegrate, come la luce solare, il vento, la pioggia, le maree, le onde e il calore geotermico. Queste fonti sono sostenibili ed ecologiche."
    },
    {
        "id": "ru_1",
        "language": "Russian",
        "title": "Искусственный Интеллект",
        "text": "Искусственный интеллект - это имитация процессов человеческого интеллекта машинами, особенно компьютерными системами. Эти процессы включают обучение, рассуждение и самокоррекцию."
    },
]
190
+
191
+
192
# Initialize retriever and load documents
# These statements run at import time: constructing the retriever loads the
# model and creates the index, then the demo corpus is encoded and indexed.
# The module-level `retriever` is the instance used by search_documents().
retriever = CrossLingualRetriever()
retriever.load_documents(MULTILINGUAL_DOCUMENTS)
195
+
196
+
197
def _score_color(score: float) -> str:
    """Return the badge background color for a relevance score.

    Scores above 30 are green, above 20 yellow, anything else red.
    """
    if score > 30:
        return "#22c55e"
    if score > 20:
        return "#eab308"
    return "#ef4444"


def format_results(results: List[Dict]) -> str:
    """Format search results as HTML for better visualization.

    Args:
        results: Records with ``"title"``, ``"language"``, ``"text"`` and a
            numeric ``"score"``, in display order.

    Returns:
        An HTML fragment with one card per result (numbered from 1), or a
        "No results found" placeholder when ``results`` is empty.
    """
    # Imported locally so this function does not rely on a module-level import.
    from html import escape

    if not results:
        return "<div style='padding: 20px; text-align: center; color: #666;'>No results found</div>"

    cards = []
    for rank, result in enumerate(results, 1):
        score_color = _score_color(result["score"])
        # Escape every text field so document content is always rendered as
        # text and can never be interpreted as markup.
        title = escape(str(result["title"]))
        language = escape(str(result["language"]))
        text = escape(str(result["text"]))

        cards.append(f"""
        <div style='margin-bottom: 20px; padding: 15px; border: 1px solid #e5e7eb; border-radius: 8px; background: #f9fafb;'>
            <div style='display: flex; justify-content: space-between; align-items: center; margin-bottom: 10px;'>
                <div>
                    <span style='font-weight: bold; font-size: 16px;'>#{rank} {title}</span>
                    <span style='margin-left: 10px; padding: 2px 8px; background: #dbeafe; color: #1e40af; border-radius: 4px; font-size: 12px;'>{language}</span>
                </div>
                <span style='padding: 4px 12px; background: {score_color}; color: white; border-radius: 4px; font-weight: bold;'>
                    Score: {result["score"]}
                </span>
            </div>
            <div style='color: #374151; line-height: 1.6;'>
                {text}
            </div>
        </div>
        """)

    # Join once instead of growing a string inside the loop.
    return "<div style='font-family: Arial, sans-serif;'>" + "".join(cards) + "</div>"
226
+
227
+
228
def search_documents(query: str, top_k: int) -> Tuple[str, str]:
    """Search documents and return formatted results.

    Args:
        query: Free-text query from the UI; ``None``, empty or
            whitespace-only input is rejected with a prompt message.
        top_k: Requested number of results; coerced to an integer and
            clamped to the range 1..10.

    Returns:
        ``(results_html, summary)``. On an empty query or on any error the
        HTML part is ``""`` and the summary carries the message.
    """
    # `not query` also covers None, on which .strip() would raise.
    if not query or not query.strip():
        return "", "Please enter a search query."

    try:
        # The UI value may arrive as a float; the retriever is given an
        # integer k, bounded on both sides.
        k = max(1, min(int(top_k), 10))
        results = retriever.search(query, k=k)
        formatted_results = format_results(results)

        # Create summary
        if results:
            languages_found = {r["language"] for r in results}
            summary = f"✅ Found {len(results)} relevant documents across {len(languages_found)} language(s): {', '.join(sorted(languages_found))}"
        else:
            summary = "❌ No relevant documents found."

        return formatted_results, summary

    except Exception as e:
        # UI boundary: report the failure to the user instead of crashing the
        # app, and keep the traceback in the log.
        logger.exception("Search error: %s", e)
        return "", f"❌ Error during search: {str(e)}"
249
+
250
+
251
# Example queries in different languages
# Each entry is [query text, number of results], in the same order as the
# inputs [query_input, top_k_slider] given to gr.Examples below.
EXAMPLE_QUERIES = [
    ["What is artificial intelligence?", 5],
    ["¿Qué es el cambio climático?", 5],
    ["Qu'est-ce que l'énergie renouvelable?", 5],
    ["人工知能とは何ですか?", 5],
    ["Was ist künstliche Intelligenz?", 3],
]
259
+
260
+
261
# Build Gradio interface
# NOTE(review): the nesting of the widgets below was reconstructed from a
# flattened listing; the output widgets are assumed to sit below the row at
# full width -- confirm against the rendered layout.
with gr.Blocks(title="Cross-Lingual Retrieval Demo", theme=gr.themes.Soft()) as demo:
    # Page header and short description.
    gr.Markdown(
        """
        # 🌍 Cross-Lingual Document Retrieval
        ### Powered by LiquidAI/LFM2-ColBERT-350M

        This demo showcases **cross-lingual retrieval** - search for documents in any language using queries in any language!
        The model finds semantically similar documents regardless of the language mismatch.

        Try searching in English, Spanish, French, German, Chinese, Japanese, Arabic, or any other language!
        """
    )

    with gr.Row():
        # Left column: query box, result-count slider and search button.
        with gr.Column(scale=2):
            query_input = gr.Textbox(
                label="🔍 Enter your query (in any language)",
                placeholder="E.g., 'artificial intelligence', 'cambio climático', 'energie renouvelable'...",
                lines=2
            )

            # Upper bound matches the cap of 10 applied in search_documents().
            top_k_slider = gr.Slider(
                minimum=1,
                maximum=10,
                value=5,
                step=1,
                label="Number of results to retrieve",
            )

            search_btn = gr.Button("Search", variant="primary", size="lg")

        # Right column: static description of the corpus.
        with gr.Column(scale=1):
            gr.Markdown(
                """
                ### 📚 Available Documents

                The corpus contains documents about:
                - **Artificial Intelligence**
                - **Climate Change**
                - **Renewable Energy**

                In languages: 🇬🇧 🇪🇸 🇫🇷 🇩🇪 🇨🇳 🇯🇵 🇸🇦 🇵🇹 🇮🇹 🇷🇺
                """
            )

    # Read-only one-line summary returned as the second value of search_documents().
    summary_output = gr.Textbox(
        label="📊 Search Summary",
        interactive=False,
        lines=2
    )

    # HTML result cards returned as the first value of search_documents().
    results_output = gr.HTML(
        label="🎯 Search Results"
    )

    # Event handlers
    # The button click and pressing Enter in the query box run the same search.
    search_btn.click(
        fn=search_documents,
        inputs=[query_input, top_k_slider],
        outputs=[results_output, summary_output]
    )

    query_input.submit(
        fn=search_documents,
        inputs=[query_input, top_k_slider],
        outputs=[results_output, summary_output]
    )

    # Examples section
    gr.Markdown("### 💡 Try these example queries:")
    # NOTE(review): with cache_examples=False, Gradio appears to only fill the
    # inputs when an example is clicked unless run_on_click=True is also set;
    # confirm whether clicking an example is meant to run the search.
    gr.Examples(
        examples=EXAMPLE_QUERIES,
        inputs=[query_input, top_k_slider],
        outputs=[results_output, summary_output],
        fn=search_documents,
        cache_examples=False,
    )

    # Footer: how the demo works and what it is built with.
    gr.Markdown(
        """
        ---
        **How it works:** This demo uses the LiquidAI LFM2-ColBERT-350M model with late interaction retrieval.
        The model encodes both queries and documents into token-level embeddings, enabling fine-grained matching
        across languages with impressive speed and accuracy.

        Built with [PyLate](https://github.com/lightonai/pylate) and [Gradio](https://gradio.app).
        """
    )
350
+
351
+
352
# Start the Gradio app only when this file is executed as a script.
if __name__ == "__main__":
    demo.launch()