Spaces:

LiquidAI
/

LFM2-ColBERT

Running

App Files Files Community

mlabonne committed 20 days ago

Commit

0d1d7d7

verified ·

1 Parent(s): da458bf

Create app.py

Browse files

Files changed (1) hide show

app.py +353 -0

app.py ADDED Viewed

	@@ -0,0 +1,353 @@

+import logging
+from typing import List, Dict, Tuple
+import gradio as gr
+from pylate import indexes, models, retrieve
# Root logging setup: INFO level, each record prefixed with time, logger name and level
logging.basicConfig(
    format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
    level=logging.INFO,
)

# Module-scoped logger shared by everything defined in this file
logger = logging.getLogger(__name__)
class CrossLingualRetriever:
    """Cross-lingual retrieval system using LiquidAI's LFM2-ColBERT model.

    Bundles a PyLate ColBERT encoder, a PLAID index stored under the
    ``pylate-index`` folder, and a ColBERT retriever bound to that index.

    Attributes:
        model: The ColBERT encoder used for both documents and queries.
        index: The PLAID index that receives document embeddings.
        retriever: The ColBERT retriever that queries ``index``.
        documents_data: The documents most recently passed to
            ``load_documents``; used to enrich raw hits with metadata.
    """

    def __init__(self, model_name: str = "LiquidAI/LFM2-ColBERT-350M-RC"):
        """Initialize the retriever with model and index.

        Args:
            model_name: Model name or path handed to ``models.ColBERT``.
        """
        logger.info("Loading model: %s", model_name)
        self.model = models.ColBERT(model_name_or_path=model_name)

        # Set padding token if not present (fall back to the EOS token)
        tokenizer = self.model.tokenizer
        if tokenizer.pad_token is None and hasattr(tokenizer, "eos_token"):
            tokenizer.pad_token = tokenizer.eos_token

        # Initialize PLAID index.
        # NOTE(review): override=True presumably replaces an existing index of
        # the same name on every start-up -- confirm against the PyLate docs.
        self.index = indexes.PLAID(
            index_folder="pylate-index",
            index_name="cross_lingual_index",
            override=True,
        )
        self.retriever = retrieve.ColBERT(index=self.index)
        self.documents_data: List[Dict[str, str]] = []
        logger.info("Model and index initialized successfully")

    def load_documents(self, documents: List[Dict[str, str]]) -> None:
        """Load and index multilingual documents.

        Args:
            documents: Dicts that each provide at least ``"id"`` and
                ``"text"``; ``search`` additionally reads ``"language"`` and
                ``"title"`` from them.
        """
        logger.info("Loading %d documents", len(documents))
        self.documents_data = documents

        # Nothing to encode: avoid pushing an empty batch into the encoder/index.
        if not documents:
            logger.warning("No documents supplied; nothing was indexed")
            return

        documents_ids = [doc["id"] for doc in documents]
        documents_text = [doc["text"] for doc in documents]

        # Encode documents
        documents_embeddings = self.model.encode(
            documents_text,
            batch_size=32,
            is_query=False,
            show_progress_bar=True,
        )

        # Add to index
        self.index.add_documents(
            documents_ids=documents_ids,
            documents_embeddings=documents_embeddings,
        )
        logger.info("Documents indexed successfully")

    def search(self, query: str, k: int = 5) -> List[Dict]:
        """Perform cross-lingual search.

        Args:
            query: Query text in any language.
            k: Maximum number of hits requested from the retriever.

        Returns:
            One dict per hit that matches a loaded document, in retriever
            order, with keys ``id``, ``score`` (rounded to 4 decimals),
            ``text``, ``language`` and ``title``. Hits whose id is not in
            ``documents_data`` are dropped. Returns an empty list when no
            documents are loaded, when ``k < 1``, or when nothing is retrieved.
        """
        logger.info("Searching for: %s", query)

        # Without loaded documents no hit could be enriched, and k < 1 asks
        # for nothing, so skip the encoder and the index entirely.
        if k < 1 or not self.documents_data:
            return []

        # Encode query
        query_embedding = self.model.encode(
            [query],
            batch_size=32,
            is_query=True,
            show_progress_bar=False,
        )

        # Retrieve results: one list of hits per query, and we sent one query
        scores = self.retriever.retrieve(
            queries_embeddings=query_embedding,
            k=k,
        )
        hits = scores[0] if len(scores) else []

        # Build the id -> document map once instead of scanning the corpus for
        # every hit. setdefault keeps the first document for a duplicated id.
        docs_by_id: Dict[str, Dict[str, str]] = {}
        for doc in self.documents_data:
            docs_by_id.setdefault(doc["id"], doc)

        # Format results
        results = []
        for hit in hits:
            doc = docs_by_id.get(hit["id"])
            if not doc:
                continue
            results.append({
                "id": hit["id"],
                "score": round(hit["score"], 4),
                "text": doc["text"],
                "language": doc["language"],
                "title": doc["title"],
            })
        return results
# Demo corpus: short passages on AI, climate change and renewable energy.
# Every entry carries the same four keys in the same order: id, language, title, text.
MULTILINGUAL_DOCUMENTS = [
    dict(
        id="en_1",
        language="English",
        title="Artificial Intelligence Overview",
        text="Artificial intelligence is the simulation of human intelligence processes by machines, especially computer systems. These processes include learning, reasoning, and self-correction.",
    ),
    dict(
        id="es_1",
        language="Spanish",
        title="Inteligencia Artificial",
        text="La inteligencia artificial es la simulación de procesos de inteligencia humana por parte de máquinas, especialmente sistemas informáticos. Estos procesos incluyen el aprendizaje, el razonamiento y la autocorrección.",
    ),
    dict(
        id="fr_1",
        language="French",
        title="Intelligence Artificielle",
        text="L'intelligence artificielle est la simulation des processus d'intelligence humaine par des machines, en particulier des systèmes informatiques. Ces processus comprennent l'apprentissage, le raisonnement et l'autocorrection.",
    ),
    dict(
        id="de_1",
        language="German",
        title="Künstliche Intelligenz",
        text="Künstliche Intelligenz ist die Simulation menschlicher Intelligenzprozesse durch Maschinen, insbesondere Computersysteme. Diese Prozesse umfassen Lernen, Argumentieren und Selbstkorrektur.",
    ),
    dict(
        id="en_2",
        language="English",
        title="Climate Change Impact",
        text="Climate change refers to long-term shifts in global temperatures and weather patterns. These shifts may be natural, but since the 1800s, human activities have been the main driver of climate change.",
    ),
    dict(
        id="es_2",
        language="Spanish",
        title="Cambio Climático",
        text="El cambio climático se refiere a cambios a largo plazo en las temperaturas globales y los patrones climáticos. Estos cambios pueden ser naturales, pero desde el siglo XIX, las actividades humanas han sido el principal impulsor del cambio climático.",
    ),
    dict(
        id="fr_2",
        language="French",
        title="Changement Climatique",
        text="Le changement climatique fait référence aux changements à long terme des températures mondiales et des conditions météorologiques. Ces changements peuvent être naturels, mais depuis les années 1800, les activités humaines sont le principal moteur du changement climatique.",
    ),
    dict(
        id="zh_1",
        language="Chinese",
        title="人工智能",
        text="人工智能是机器(尤其是计算机系统)对人类智能过程的模拟。这些过程包括学习、推理和自我纠正。",
    ),
    dict(
        id="ja_1",
        language="Japanese",
        title="人工知能",
        text="人工知能とは、機械、特にコンピュータシステムによる人間の知能プロセスのシミュレーションです。これらのプロセスには、学習、推論、自己修正が含まれます。",
    ),
    dict(
        id="ar_1",
        language="Arabic",
        title="الذكاء الاصطناعي",
        text="الذكاء الاصطناعي هو محاكاة عمليات الذكاء البشري بواسطة الآلات، وخاصة أنظمة الكمبيوتر. تشمل هذه العمليات التعلم والاستدلال والتصحيح الذاتي.",
    ),
    dict(
        id="en_3",
        language="English",
        title="Renewable Energy Sources",
        text="Renewable energy comes from natural sources that are constantly replenished, such as sunlight, wind, rain, tides, waves, and geothermal heat. These sources are sustainable and environmentally friendly.",
    ),
    dict(
        id="de_2",
        language="German",
        title="Erneuerbare Energien",
        text="Erneuerbare Energie stammt aus natürlichen Quellen, die ständig nachgefüllt werden, wie Sonnenlicht, Wind, Regen, Gezeiten, Wellen und geothermische Wärme. Diese Quellen sind nachhaltig und umweltfreundlich.",
    ),
    dict(
        id="pt_1",
        language="Portuguese",
        title="Energia Renovável",
        text="A energia renovável vem de fontes naturais que são constantemente reabastecidas, como luz solar, vento, chuva, marés, ondas e calor geotérmico. Essas fontes são sustentáveis e ambientalmente amigáveis.",
    ),
    dict(
        id="it_1",
        language="Italian",
        title="Energia Rinnovabile",
        text="L'energia rinnovabile proviene da fonti naturali che vengono costantemente reintegrate, come la luce solare, il vento, la pioggia, le maree, le onde e il calore geotermico. Queste fonti sono sostenibili ed ecologiche.",
    ),
    dict(
        id="ru_1",
        language="Russian",
        title="Искусственный Интеллект",
        text="Искусственный интеллект - это имитация процессов человеческого интеллекта машинами, особенно компьютерными системами. Эти процессы включают обучение, рассуждение и самокоррекцию.",
    ),
]
# Module-level retriever shared by the UI callbacks; it is constructed and
# fed the demo corpus as soon as this file is executed.
retriever = CrossLingualRetriever()
retriever.load_documents(documents=MULTILINGUAL_DOCUMENTS)
def format_results(results: List[Dict]) -> str:
    """Format search results as HTML for better visualization.

    Args:
        results: Result dicts providing ``"title"``, ``"language"``,
            ``"score"`` (numeric) and ``"text"``.

    Returns:
        An HTML fragment with one card per result, numbered from 1, or a
        centered "No results found" placeholder when ``results`` is empty.
        The score badge is green above 30, yellow above 20 and red otherwise.
    """
    # Imported locally because the file has no module-level ``html`` import.
    from html import escape

    if not results:
        return "<div style='padding: 20px; text-align: center; color: #666;'>No results found</div>"

    cards = []
    for rank, result in enumerate(results, 1):
        score = result["score"]
        if score > 30:
            score_color = "#22c55e"
        elif score > 20:
            score_color = "#eab308"
        else:
            score_color = "#ef4444"

        # Document fields are data, not markup: escape them so characters such
        # as '<' or '&' cannot break the layout or inject HTML/JS into the page.
        title = escape(str(result["title"]))
        language = escape(str(result["language"]))
        score_text = escape(str(score))
        text = escape(str(result["text"]))

        cards.append(f"""
        <div style='margin-bottom: 20px; padding: 15px; border: 1px solid #e5e7eb; border-radius: 8px; background: #f9fafb;'>
            <div style='display: flex; justify-content: space-between; align-items: center; margin-bottom: 10px;'>
                <div>
                    <span style='font-weight: bold; font-size: 16px;'>#{rank} {title}</span>
                    <span style='margin-left: 10px; padding: 2px 8px; background: #dbeafe; color: #1e40af; border-radius: 4px; font-size: 12px;'>{language}</span>
                </div>
                <span style='padding: 4px 12px; background: {score_color}; color: white; border-radius: 4px; font-weight: bold;'>
                    Score: {score_text}
                </span>
            </div>
            <div style='color: #374151; line-height: 1.6;'>
                {text}
            </div>
        </div>
        """)

    # Join once instead of growing a string inside the loop.
    return "<div style='font-family: Arial, sans-serif;'>" + "".join(cards) + "</div>"
def search_documents(query: str, top_k: int) -> Tuple[str, str]:
    """Search documents and return formatted results.

    Args:
        query: Text typed by the user; ``None`` or blank input is rejected.
        top_k: Requested number of results; coerced to ``int`` and clamped to
            the range 1..10 before it reaches the retriever.

    Returns:
        A ``(results_html, summary)`` pair. On a blank query or on any error
        the HTML part is an empty string and the summary carries the message.
    """
    # Treat a missing value like an empty one instead of crashing on .strip().
    if not query or not query.strip():
        return "", "Please enter a search query."

    try:
        # A UI slider may hand over a float; the retriever expects an int of at
        # least 1, and the demo caps requests at 10.
        k = max(1, min(int(top_k), 10))
        results = retriever.search(query, k=k)
        formatted_results = format_results(results)

        # Create summary
        if results:
            languages_found = {r["language"] for r in results}
            summary = f"✅ Found {len(results)} relevant documents across {len(languages_found)} language(s): {', '.join(sorted(languages_found))}"
        else:
            summary = "❌ No relevant documents found."
        return formatted_results, summary
    except Exception as e:
        # UI boundary: report the failure to the user rather than crash the
        # app, but keep the traceback in the log for debugging.
        logger.exception("Search error: %s", e)
        return "", f"❌ Error during search: {str(e)}"
# Sample (query, top_k) pairs in several languages, offered as one-click examples.
# Each row is a list ordered like the UI inputs: query text first, result count second.
EXAMPLE_QUERIES = [
    [example_text, example_top_k]
    for example_text, example_top_k in (
        ("What is artificial intelligence?", 5),
        ("¿Qué es el cambio climático?", 5),
        ("Qu'est-ce que l'énergie renouvelable?", 5),
        ("人工知能とは何ですか？", 5),
        ("Was ist künstliche Intelligenz?", 3),
    )
]
# Build Gradio interface: intro text, query controls beside a corpus overview,
# then the summary box, the HTML results pane, the examples and a footer.
with gr.Blocks(title="Cross-Lingual Retrieval Demo", theme=gr.themes.Soft()) as demo:
    gr.Markdown(
        """
        # 🌍 Cross-Lingual Document Retrieval
        ### Powered by LiquidAI/LFM2-ColBERT-350M
        This demo showcases **cross-lingual retrieval** - search for documents in any language using queries in any language!
        The model finds semantically similar documents regardless of the language mismatch.
        Try searching in English, Spanish, French, German, Chinese, Japanese, Arabic, or any other language!
        """
    )
    with gr.Row():
        with gr.Column(scale=2):
            query_input = gr.Textbox(
                label="🔍 Enter your query (in any language)",
                placeholder="E.g., 'artificial intelligence', 'cambio climático', 'energie renouvelable'...",
                lines=2
            )
            # Range matches the 1..10 clamp applied by search_documents
            top_k_slider = gr.Slider(
                minimum=1,
                maximum=10,
                value=5,
                step=1,
                label="Number of results to retrieve",
            )
            search_btn = gr.Button("Search", variant="primary", size="lg")
        with gr.Column(scale=1):
            gr.Markdown(
                """
                ### 📚 Available Documents
                The corpus contains documents about:
                - **Artificial Intelligence**
                - **Climate Change**
                - **Renewable Energy**
                In languages: 🇬🇧 🇪🇸 🇫🇷 🇩🇪 🇨🇳 🇯🇵 🇸🇦 🇵🇹 🇮🇹 🇷🇺
                """
            )
    summary_output = gr.Textbox(
        label="📊 Search Summary",
        interactive=False,
        lines=2
    )
    results_output = gr.HTML(
        label="🎯 Search Results"
    )

    # Event handlers. One shared wiring spec keeps the button, the Enter key
    # and the examples pointed at the same callback, inputs and outputs.
    # Output order follows search_documents' return value: (html, summary).
    _search_event = dict(
        fn=search_documents,
        inputs=[query_input, top_k_slider],
        outputs=[results_output, summary_output],
    )
    search_btn.click(**_search_event)
    query_input.submit(**_search_event)

    # Examples section
    gr.Markdown("### 💡 Try these example queries:")
    gr.Examples(
        examples=EXAMPLE_QUERIES,
        cache_examples=False,
        # Without caching, fn/outputs are only used when run_on_click is set;
        # enable it so that picking an example actually runs the search.
        run_on_click=True,
        **_search_event,
    )
    gr.Markdown(
        """
        ---
        **How it works:** This demo uses the LiquidAI LFM2-ColBERT-350M model with late interaction retrieval.
        The model encodes both queries and documents into token-level embeddings, enabling fine-grained matching
        across languages with impressive speed and accuracy.
        Built with [PyLate](https://github.com/lightonai/pylate) and [Gradio](https://gradio.app).
        """
    )

# Start the web server only when this file is run as a script
if __name__ == "__main__":
    demo.launch()