import subprocess import sys # Install specific versions at runtime print("Installing dependencies...") subprocess.check_call([sys.executable, "-m", "pip", "install", "-U", "transformers==4.56.2"]) print("Dependencies installed successfully!") import logging from typing import List, Dict, Tuple import gradio as gr from pylate import indexes, models, retrieve # Configure logging logging.basicConfig( level=logging.INFO, format="%(asctime)s - %(name)s - %(levelname)s - %(message)s" ) logger = logging.getLogger(__name__) class CrossLingualRetriever: """Cross-lingual retrieval system using LiquidAI's LFM2-ColBERT model.""" def __init__(self, model_name: str = "LiquidAI/LFM2-ColBERT-350M-RC"): """Initialize the retriever with model and index.""" logger.info(f"Loading model: {model_name}") self.model = models.ColBERT(model_name_or_path=model_name) # Set padding token if not present if self.model.tokenizer.pad_token is None and hasattr(self.model.tokenizer, "eos_token"): self.model.tokenizer.pad_token = self.model.tokenizer.eos_token # Initialize PLAID index self.index = indexes.PLAID( index_folder="pylate-index", index_name="cross_lingual_index", override=True, ) self.retriever = retrieve.ColBERT(index=self.index) self.documents_data = [] logger.info("Model and index initialized successfully") def load_documents(self, documents: List[Dict[str, str]]) -> None: """Load and index multilingual documents.""" logger.info(f"Loading {len(documents)} documents") self.documents_data = documents documents_ids = [doc["id"] for doc in documents] documents_text = [doc["text"] for doc in documents] # Encode documents documents_embeddings = self.model.encode( documents_text, batch_size=32, is_query=False, show_progress_bar=True, ) # Add to index self.index.add_documents( documents_ids=documents_ids, documents_embeddings=documents_embeddings, ) logger.info("Documents indexed successfully") def search(self, query: str, k: int = 5) -> List[Dict]: """Perform cross-lingual search.""" logger.info(f"Searching for: {query}") # Encode query query_embedding = self.model.encode( [query], batch_size=32, is_query=True, show_progress_bar=False, ) # Retrieve results scores = self.retriever.retrieve( queries_embeddings=query_embedding, k=k, ) # Format results results = [] for score in scores[0]: doc = next((d for d in self.documents_data if d["id"] == score["id"]), None) if doc: results.append({ "id": score["id"], "score": round(score["score"], 4), "text": doc["text"], "language": doc["language"], "title": doc["title"] }) return results # Multilingual document corpus MULTILINGUAL_DOCUMENTS = [ { "id": "en_1", "language": "English", "title": "Artificial Intelligence Overview", "text": "Artificial intelligence is the simulation of human intelligence processes by machines, especially computer systems. These processes include learning, reasoning, and self-correction." }, { "id": "es_1", "language": "Spanish", "title": "Inteligencia Artificial", "text": "La inteligencia artificial es la simulación de procesos de inteligencia humana por parte de máquinas, especialmente sistemas informáticos. Estos procesos incluyen el aprendizaje, el razonamiento y la autocorrección." }, { "id": "fr_1", "language": "French", "title": "Intelligence Artificielle", "text": "L'intelligence artificielle est la simulation des processus d'intelligence humaine par des machines, en particulier des systèmes informatiques. Ces processus comprennent l'apprentissage, le raisonnement et l'autocorrection." }, { "id": "de_1", "language": "German", "title": "Künstliche Intelligenz", "text": "Künstliche Intelligenz ist die Simulation menschlicher Intelligenzprozesse durch Maschinen, insbesondere Computersysteme. Diese Prozesse umfassen Lernen, Argumentieren und Selbstkorrektur." }, { "id": "en_2", "language": "English", "title": "Climate Change Impact", "text": "Climate change refers to long-term shifts in global temperatures and weather patterns. These shifts may be natural, but since the 1800s, human activities have been the main driver of climate change." }, { "id": "es_2", "language": "Spanish", "title": "Cambio Climático", "text": "El cambio climático se refiere a cambios a largo plazo en las temperaturas globales y los patrones climáticos. Estos cambios pueden ser naturales, pero desde el siglo XIX, las actividades humanas han sido el principal impulsor del cambio climático." }, { "id": "fr_2", "language": "French", "title": "Changement Climatique", "text": "Le changement climatique fait référence aux changements à long terme des températures mondiales et des conditions météorologiques. Ces changements peuvent être naturels, mais depuis les années 1800, les activités humaines sont le principal moteur du changement climatique." }, { "id": "zh_1", "language": "Chinese", "title": "人工智能", "text": "人工智能是机器(尤其是计算机系统)对人类智能过程的模拟。这些过程包括学习、推理和自我纠正。" }, { "id": "ja_1", "language": "Japanese", "title": "人工知能", "text": "人工知能とは、機械、特にコンピュータシステムによる人間の知能プロセスのシミュレーションです。これらのプロセスには、学習、推論、自己修正が含まれます。" }, { "id": "ar_1", "language": "Arabic", "title": "الذكاء الاصطناعي", "text": "الذكاء الاصطناعي هو محاكاة عمليات الذكاء البشري بواسطة الآلات، وخاصة أنظمة الكمبيوتر. تشمل هذه العمليات التعلم والاستدلال والتصحيح الذاتي." }, { "id": "en_3", "language": "English", "title": "Renewable Energy Sources", "text": "Renewable energy comes from natural sources that are constantly replenished, such as sunlight, wind, rain, tides, waves, and geothermal heat. These sources are sustainable and environmentally friendly." }, { "id": "de_2", "language": "German", "title": "Erneuerbare Energien", "text": "Erneuerbare Energie stammt aus natürlichen Quellen, die ständig nachgefüllt werden, wie Sonnenlicht, Wind, Regen, Gezeiten, Wellen und geothermische Wärme. Diese Quellen sind nachhaltig und umweltfreundlich." }, { "id": "pt_1", "language": "Portuguese", "title": "Energia Renovável", "text": "A energia renovável vem de fontes naturais que são constantemente reabastecidas, como luz solar, vento, chuva, marés, ondas e calor geotérmico. Essas fontes são sustentáveis e ambientalmente amigáveis." }, { "id": "it_1", "language": "Italian", "title": "Energia Rinnovabile", "text": "L'energia rinnovabile proviene da fonti naturali che vengono costantemente reintegrate, come la luce solare, il vento, la pioggia, le maree, le onde e il calore geotermico. Queste fonti sono sostenibili ed ecologiche." }, { "id": "ru_1", "language": "Russian", "title": "Искусственный Интеллект", "text": "Искусственный интеллект - это имитация процессов человеческого интеллекта машинами, особенно компьютерными системами. Эти процессы включают обучение, рассуждение и самокоррекцию." }, ] # Initialize retriever and load documents retriever = CrossLingualRetriever() retriever.load_documents(MULTILINGUAL_DOCUMENTS) def format_results(results: List[Dict]) -> str: """Format search results as HTML for better visualization.""" if not results: return "