import logging
from typing import List, Dict, Tuple

import gradio as gr
from pylate import indexes, models, retrieve

# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s - %(name)s - %(levelname)s - %(message)s"
)
logger = logging.getLogger(__name__)


class CrossLingualRetriever:
    """Cross-lingual retrieval system using LiquidAI's LFM2-ColBERT model."""

    def __init__(self, model_name: str = "LiquidAI/LFM2-ColBERT-350M-RC"):
        """Initialize the retriever with model and index."""
        logger.info(f"Loading model: {model_name}")
        self.model = models.ColBERT(model_name_or_path=model_name)

        # Set padding token if not present
        if self.model.tokenizer.pad_token is None and hasattr(self.model.tokenizer, "eos_token"):
            self.model.tokenizer.pad_token = self.model.tokenizer.eos_token

        # Initialize PLAID index
        self.index = indexes.PLAID(
            index_folder="pylate-index",
            index_name="cross_lingual_index",
            override=True,
        )
        self.retriever = retrieve.ColBERT(index=self.index)
        self.documents_data = []
        logger.info("Model and index initialized successfully")

    def load_documents(self, documents: List[Dict[str, str]]) -> None:
        """Load and index multilingual documents."""
        logger.info(f"Loading {len(documents)} documents")
        self.documents_data = documents
        documents_ids = [doc["id"] for doc in documents]
        documents_text = [doc["text"] for doc in documents]

        # Encode documents. ColBERT-style encoders return one embedding per token,
        # so this is a list of per-document (num_tokens, embedding_dim) arrays
        # rather than a single vector per document.
        documents_embeddings = self.model.encode(
            documents_text,
            batch_size=32,
            is_query=False,
            show_progress_bar=True,
        )

        # Add to index
        self.index.add_documents(
            documents_ids=documents_ids,
            documents_embeddings=documents_embeddings,
        )
        logger.info("Documents indexed successfully")

    def search(self, query: str, k: int = 5) -> List[Dict]:
        """Perform cross-lingual search."""
        logger.info(f"Searching for: {query}")

        # Encode query
        query_embedding = self.model.encode(
            [query],
            batch_size=32,
            is_query=True,
            show_progress_bar=False,
        )

        # Retrieve results
        scores = self.retriever.retrieve(
            queries_embeddings=query_embedding,
            k=k,
        )

        # Format results
        results = []
        for score in scores[0]:
            doc = next((d for d in self.documents_data if d["id"] == score["id"]), None)
            if doc:
                results.append({
                    "id": score["id"],
                    "score": round(score["score"], 4),
                    "text": doc["text"],
                    "language": doc["language"],
                    "title": doc["title"]
                })
        return results
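

# Illustrative only: a minimal sketch of the ColBERT-style late-interaction
# (MaxSim) scoring that PyLate's retriever performs internally over the PLAID
# index. This function is not called anywhere in the app; its name and the
# assumption that torch is importable (it is pulled in by PyLate's own
# dependencies) are ours, not part of the PyLate API.
def maxsim_score_sketch(query_embedding, document_embedding) -> float:
    """Score one query against one document: for every query token, take the
    maximum similarity over all document tokens, then sum over query tokens."""
    import torch

    q = torch.as_tensor(query_embedding, dtype=torch.float32)     # (num_query_tokens, dim)
    d = torch.as_tensor(document_embedding, dtype=torch.float32)  # (num_doc_tokens, dim)
    similarity = q @ d.T                                          # (num_query_tokens, num_doc_tokens)
    return similarity.max(dim=1).values.sum().item()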


# Multilingual document corpus
MULTILINGUAL_DOCUMENTS = [
    {
        "id": "en_1",
        "language": "English",
        "title": "Artificial Intelligence Overview",
        "text": "Artificial intelligence is the simulation of human intelligence processes by machines, especially computer systems. These processes include learning, reasoning, and self-correction."
    },
    {
        "id": "es_1",
        "language": "Spanish",
        "title": "Inteligencia Artificial",
        "text": "La inteligencia artificial es la simulación de procesos de inteligencia humana por parte de máquinas, especialmente sistemas informáticos. Estos procesos incluyen el aprendizaje, el razonamiento y la autocorrección."
    },
    {
        "id": "fr_1",
        "language": "French",
        "title": "Intelligence Artificielle",
        "text": "L'intelligence artificielle est la simulation des processus d'intelligence humaine par des machines, en particulier des systèmes informatiques. Ces processus comprennent l'apprentissage, le raisonnement et l'autocorrection."
    },
    {
        "id": "de_1",
        "language": "German",
        "title": "Künstliche Intelligenz",
        "text": "Künstliche Intelligenz ist die Simulation menschlicher Intelligenzprozesse durch Maschinen, insbesondere Computersysteme. Diese Prozesse umfassen Lernen, Argumentieren und Selbstkorrektur."
    },
    {
        "id": "en_2",
        "language": "English",
        "title": "Climate Change Impact",
        "text": "Climate change refers to long-term shifts in global temperatures and weather patterns. These shifts may be natural, but since the 1800s, human activities have been the main driver of climate change."
    },
    {
        "id": "es_2",
        "language": "Spanish",
        "title": "Cambio Climático",
        "text": "El cambio climático se refiere a cambios a largo plazo en las temperaturas globales y los patrones climáticos. Estos cambios pueden ser naturales, pero desde el siglo XIX, las actividades humanas han sido el principal impulsor del cambio climático."
    },
    {
        "id": "fr_2",
        "language": "French",
        "title": "Changement Climatique",
        "text": "Le changement climatique fait référence aux changements à long terme des températures mondiales et des conditions météorologiques. Ces changements peuvent être naturels, mais depuis les années 1800, les activités humaines sont le principal moteur du changement climatique."
    },
    {
        "id": "zh_1",
        "language": "Chinese",
        "title": "人工智能",
        "text": "人工智能是机器(尤其是计算机系统)对人类智能过程的模拟。这些过程包括学习、推理和自我纠正。"
    },
    {
        "id": "ja_1",
        "language": "Japanese",
        "title": "人工知能",
        "text": "人工知能とは、機械、特にコンピュータシステムによる人間の知能プロセスのシミュレーションです。これらのプロセスには、学習、推論、自己修正が含まれます。"
    },
    {
        "id": "ar_1",
        "language": "Arabic",
        "title": "الذكاء الاصطناعي",
        "text": "الذكاء الاصطناعي هو محاكاة عمليات الذكاء البشري بواسطة الآلات، وخاصة أنظمة الكمبيوتر. تشمل هذه العمليات التعلم والاستدلال والتصحيح الذاتي."
    },
    {
        "id": "en_3",
        "language": "English",
        "title": "Renewable Energy Sources",
        "text": "Renewable energy comes from natural sources that are constantly replenished, such as sunlight, wind, rain, tides, waves, and geothermal heat. These sources are sustainable and environmentally friendly."
    },
    {
        "id": "de_2",
        "language": "German",
        "title": "Erneuerbare Energien",
        "text": "Erneuerbare Energie stammt aus natürlichen Quellen, die ständig nachgefüllt werden, wie Sonnenlicht, Wind, Regen, Gezeiten, Wellen und geothermische Wärme. Diese Quellen sind nachhaltig und umweltfreundlich."
    },
    {
        "id": "pt_1",
        "language": "Portuguese",
        "title": "Energia Renovável",
        "text": "A energia renovável vem de fontes naturais que são constantemente reabastecidas, como luz solar, vento, chuva, marés, ondas e calor geotérmico. Essas fontes são sustentáveis e ambientalmente amigáveis."
    },
    {
        "id": "it_1",
        "language": "Italian",
        "title": "Energia Rinnovabile",
        "text": "L'energia rinnovabile proviene da fonti naturali che vengono costantemente reintegrate, come la luce solare, il vento, la pioggia, le maree, le onde e il calore geotermico. Queste fonti sono sostenibili ed ecologiche."
    },
    {
        "id": "ru_1",
        "language": "Russian",
        "title": "Искусственный Интеллект",
        "text": "Искусственный интеллект - это имитация процессов человеческого интеллекта машинами, особенно компьютерными системами. Эти процессы включают обучение, рассуждение и самокоррекцию."
    },
]

# Initialize retriever and load documents
retriever = CrossLingualRetriever()
retriever.load_documents(MULTILINGUAL_DOCUMENTS)
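
# Illustrative only: a quick programmatic sanity check of cross-lingual
# retrieval outside the Gradio UI. The sample query is ours; the dict keys
# match those built in CrossLingualRetriever.search(). Uncomment to run locally.
# for hit in retriever.search("¿Qué es la inteligencia artificial?", k=3):
#     logger.info(f'{hit["language"]} ({hit["score"]}): {hit["title"]}')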


def format_results(results: List[Dict]) -> str:
    """Format search results as HTML for better visualization."""
    if not results:
        return "<div style='padding: 20px; text-align: center; color: #666;'>No results found</div>"

    html = "<div style='font-family: Arial, sans-serif;'>"
    for i, result in enumerate(results, 1):
        score_color = "#22c55e" if result["score"] > 30 else "#eab308" if result["score"] > 20 else "#ef4444"
        html += f"""
        <div style='margin-bottom: 20px; padding: 15px; border: 1px solid #e5e7eb; border-radius: 8px; background: #f9fafb;'>
            <div style='display: flex; justify-content: space-between; align-items: center; margin-bottom: 10px;'>
                <div>
                    <span style='font-weight: bold; font-size: 16px;'>#{i} {result["title"]}</span>
                    <span style='margin-left: 10px; padding: 2px 8px; background: #dbeafe; color: #1e40af; border-radius: 4px; font-size: 12px;'>{result["language"]}</span>
                </div>
                <span style='padding: 4px 12px; background: {score_color}; color: white; border-radius: 4px; font-weight: bold;'>
                    Score: {result["score"]}
                </span>
            </div>
            <div style='color: #374151; line-height: 1.6;'>
                {result["text"]}
            </div>
        </div>
        """
    html += "</div>"
    return html


def search_documents(query: str, top_k: int) -> Tuple[str, str]:
    """Search documents and return formatted results."""
    if not query.strip():
        return "", "Please enter a search query."

    try:
        results = retriever.search(query, k=min(int(top_k), 10))
        formatted_results = format_results(results)

        # Create summary
        if results:
            languages_found = set(r["language"] for r in results)
            summary = f"✅ Found {len(results)} relevant documents across {len(languages_found)} language(s): {', '.join(sorted(languages_found))}"
        else:
            summary = "❌ No relevant documents found."

        return formatted_results, summary
    except Exception as e:
        logger.error(f"Search error: {e}")
        return "", f"❌ Error during search: {str(e)}"

# Example queries in different languages
EXAMPLE_QUERIES = [
    ["What is artificial intelligence?", 5],
    ["¿Qué es el cambio climático?", 5],
    ["Qu'est-ce que l'énergie renouvelable?", 5],
    ["人工知能とは何ですか?", 5],
    ["Was ist künstliche Intelligenz?", 3],
]

# Build Gradio interface
with gr.Blocks(title="Cross-Lingual Retrieval Demo", theme=gr.themes.Soft()) as demo:
    gr.Markdown(
        """
        # 🌍 Cross-Lingual Document Retrieval
        ### Powered by LiquidAI/LFM2-ColBERT-350M

        This demo showcases **cross-lingual retrieval** - search for documents in any language using queries in any language!
        The model finds semantically similar documents regardless of the language mismatch.

        Try searching in English, Spanish, French, German, Chinese, Japanese, Arabic, or any other language!
        """
    )

    with gr.Row():
        with gr.Column(scale=2):
            query_input = gr.Textbox(
                label="🔍 Enter your query (in any language)",
                placeholder="E.g., 'artificial intelligence', 'cambio climático', 'énergie renouvelable'...",
                lines=2
            )
            top_k_slider = gr.Slider(
                minimum=1,
                maximum=10,
                value=5,
                step=1,
                label="Number of results to retrieve",
            )
            search_btn = gr.Button("Search", variant="primary", size="lg")

        with gr.Column(scale=1):
            gr.Markdown(
                """
                ### 📚 Available Documents

                The corpus contains documents about:
                - **Artificial Intelligence**
                - **Climate Change**
                - **Renewable Energy**

                In languages: 🇬🇧 🇪🇸 🇫🇷 🇩🇪 🇨🇳 🇯🇵 🇸🇦 🇵🇹 🇮🇹 🇷🇺
                """
            )

    summary_output = gr.Textbox(
        label="📊 Search Summary",
        interactive=False,
        lines=2
    )
    results_output = gr.HTML(
        label="🎯 Search Results"
    )

    # Event handlers
    search_btn.click(
        fn=search_documents,
        inputs=[query_input, top_k_slider],
        outputs=[results_output, summary_output]
    )
    query_input.submit(
        fn=search_documents,
        inputs=[query_input, top_k_slider],
        outputs=[results_output, summary_output]
    )

    # Examples section
    gr.Markdown("### 💡 Try these example queries:")
    gr.Examples(
        examples=EXAMPLE_QUERIES,
        inputs=[query_input, top_k_slider],
        outputs=[results_output, summary_output],
        fn=search_documents,
        cache_examples=False,
    )

    gr.Markdown(
        """
        ---
        **How it works:** This demo uses the LiquidAI LFM2-ColBERT-350M model with late-interaction retrieval.
        Queries and documents are both encoded into token-level embeddings, and each query token is matched
        against its most similar document token (MaxSim), enabling fine-grained matching across languages.

        Built with [PyLate](https://github.com/lightonai/pylate) and [Gradio](https://gradio.app).
        """
    )


if __name__ == "__main__":
    demo.launch()