Spaces:
Running
Running
| import os | |
| from typing import List | |
| from llama_index.embeddings.nebius import NebiusEmbedding | |
| from llama_index.llms.nebius import NebiusLLM | |
| from llama_index.vector_stores.mongodb import MongoDBAtlasVectorSearch | |
| from pymongo import MongoClient | |
| from pymongo.operations import SearchIndexModel | |
# Chat model served by Nebius. The API key is read from the NEBIUS_API_KEY
# environment variable and is None when that variable is unset.
llm = NebiusLLM(
    model="meta-llama/Llama-3.3-70B-Instruct-fast",
    api_key=os.environ.get("NEBIUS_API_KEY"),
)

# Embedding model served by Nebius, configured with embed_batch_size=10.
embed_model = NebiusEmbedding(
    model_name="BAAI/bge-en-icl",
    api_key=os.environ.get("NEBIUS_API_KEY"),
    embed_batch_size=10,
)
# MongoDB connection string taken from the environment (None when the
# MONGO_DB_URI variable is unset); one client is shared by the whole module.
MONGO_DB_URI = os.environ.get("MONGO_DB_URI")
mongo_client = MongoClient(MONGO_DB_URI)

# Database, collection and search-index names used throughout this module.
DB_NAME = "docmcp"
COLLECTION_NAME = "doc_rag"  # text chunks plus their embeddings
REPOS_COLLECTION_NAME = "ingested_repos"  # one tracking record per repository
VS_INDEX_NAME = "vector_index"
FTS_INDEX_NAME = "fts_index"
# Vector-search index definition: cosine similarity over the 4096-dimensional
# "embedding" field, with "metadata.repo" declared as a filter field.
vs_model = SearchIndexModel(
    name=VS_INDEX_NAME,
    type="vectorSearch",
    definition={
        "fields": [
            {
                "type": "vector",
                "path": "embedding",
                "numDimensions": 4096,
                "similarity": "cosine",
            },
            {
                "type": "filter",
                "path": "metadata.repo",
            },
        ]
    },
)

# Full-text search index definition: static mapping that indexes only the
# "text" field as a string.
fts_model = SearchIndexModel(
    name=FTS_INDEX_NAME,
    type="search",
    definition={
        "mappings": {
            "dynamic": False,
            "fields": {
                "text": {"type": "string"},
            },
        }
    },
)
def get_vector_store():
    """Return the Atlas vector store backed by the document collection.

    Builds a ``MongoDBAtlasVectorSearch`` over ``DB_NAME.COLLECTION_NAME`` and
    makes sure the vector-search and full-text search indexes are defined on
    that collection.

    Index creation is idempotent: only index names that are not already
    present on the collection are requested. Unconditionally re-creating an
    index that already exists is rejected by the server, which would make
    every call after the first one (or after an application restart) raise.

    Returns:
        The configured ``MongoDBAtlasVectorSearch`` instance.
    """
    collection = mongo_client[DB_NAME][COLLECTION_NAME]
    vector_store = MongoDBAtlasVectorSearch(
        mongodb_client=mongo_client,
        db_name=DB_NAME,
        collection_name=COLLECTION_NAME,
        vector_index_name=VS_INDEX_NAME,
        fulltext_index_name=FTS_INDEX_NAME,
        embedding_key="embedding",
        text_key="text",
    )

    # Names of the search indexes that are already defined on the collection.
    existing_names = {
        index.get("name") for index in collection.list_search_indexes()
    }
    wanted = ((VS_INDEX_NAME, vs_model), (FTS_INDEX_NAME, fts_model))
    missing_models = [
        model for name, model in wanted if name not in existing_names
    ]
    if missing_models:
        collection.create_search_indexes(models=missing_models)

    return vector_store
def get_repos_collection():
    """Return the collection that holds one tracking record per repository."""
    database = mongo_client[DB_NAME]
    return database[REPOS_COLLECTION_NAME]
def store_ingested_repo(repo_name: str, ingested_files: List[str]) -> bool:
    """Create or replace the tracking record for an ingested repository.

    The record is keyed by the repository name, so ingesting the same
    repository again overwrites its previous record.

    Args:
        repo_name: Repository name; also used as the document ``_id``.
        ingested_files: Paths of the files that were ingested.

    Returns:
        True when the record was written, False when any error occurred
        (the error is printed, not raised).
    """
    # Imported locally so this function does not depend on a module-level
    # import being added elsewhere in the file.
    from datetime import datetime, timezone

    try:
        repos_collection = get_repos_collection()
        repo_doc = {
            "_id": repo_name,  # repo name doubles as the unique ID
            "repo_name": repo_name,
            "ingested_files": ingested_files,
            "file_count": len(ingested_files),
            # get_repo_details() reports this field; without it every
            # repository shows "Unknown". Stored as a UTC ISO-8601 string so
            # it stays the same type as that fallback value.
            "last_updated": datetime.now(timezone.utc).isoformat(),
        }
        # Upsert: replace the existing record or insert a new one.
        repos_collection.replace_one({"_id": repo_name}, repo_doc, upsert=True)
        # NOTE(review): the leading "β" in the two messages below looks like a
        # mis-decoded emoji — confirm the file's encoding.
        print(f"β Stored repository: {repo_name} with {len(ingested_files)} files")
        return True
    except Exception as e:
        print(f"β Error storing repository data: {e}")
        return False
def get_available_repos():
    """Return the sorted names of all tracked repositories.

    Returns an empty list when no repository is recorded or when the lookup
    fails for any reason (the error is printed, not raised).
    """
    try:
        # Only the repo_name field is requested from the database.
        cursor = get_repos_collection().find({}, {"repo_name": 1})
        names = [doc["repo_name"] for doc in cursor]
        # sorted() of an empty list is [], which covers the "no repos" case.
        return sorted(names)
    except Exception as e:
        print(f"Error getting repos from database: {e}")
        return []
def get_repo_details():
    """Get detailed information about all repositories.

    Returns a list with one dict per tracking record, containing
    ``repo_name``, ``file_count``, ``last_updated`` and ``ingested_files``;
    fields missing from a record are replaced by a default. On any error the
    problem is printed and an empty list is returned.
    """
    try:
        records = get_repos_collection().find({})
        return [
            {
                "repo_name": record.get("repo_name", "Unknown"),
                "file_count": record.get("file_count", 0),
                "last_updated": record.get("last_updated", "Unknown"),
                "ingested_files": record.get("ingested_files", []),
            }
            for record in records
        ]
    except Exception as e:
        print(f"Error getting repo details: {e}")
        return []
def delete_repository_data(repo_name):
    """Delete a repository's stored documents and its tracking record.

    Args:
        repo_name: Name of the repository to remove.

    Returns:
        A dict with the keys ``success``, ``message``,
        ``vector_docs_deleted`` and ``repo_record_deleted``. ``success`` is
        True when at least one document or the tracking record was removed.
        On any error the failure is printed and reported in ``message`` with
        ``success`` set to False.
    """
    # NOTE(review): the leading "β" / "β οΈ" in the messages below looks like
    # mis-decoded emoji — confirm the file's encoding.
    try:
        # Remove every document in the vector store tagged with this repo.
        docs_collection = mongo_client[DB_NAME][COLLECTION_NAME]
        docs_removed = docs_collection.delete_many(
            {"metadata.repo": repo_name}
        ).deleted_count

        # Remove the repo's entry from the tracking collection.
        record_outcome = get_repos_collection().delete_one({"_id": repo_name})
        record_removed = record_outcome.deleted_count > 0

        anything_removed = docs_removed > 0 or record_removed
        if not anything_removed:
            message = f"β οΈ Repository '{repo_name}' not found or already deleted"
        else:
            message = f"β Successfully deleted repository '{repo_name}'"
            if docs_removed > 0:
                message += f" ({docs_removed} documents removed)"

        print(message)
        return {
            "success": anything_removed,
            "message": message,
            "vector_docs_deleted": docs_removed,
            "repo_record_deleted": record_removed,
        }
    except Exception as e:
        error_msg = f"β Error deleting repository '{repo_name}': {str(e)}"
        print(error_msg)
        return {
            "success": False,
            "message": error_msg,
            "vector_docs_deleted": 0,
            "repo_record_deleted": False,
        }
def get_repository_stats():
    """Return aggregate counts for the tracked repositories.

    Returns:
        A dict with ``total_repositories`` (tracking records),
        ``total_documents`` (documents in the vector-store collection) and
        ``total_files`` (sum of each record's ``file_count``). All three are
        0 when any error occurs; the error is printed, not raised.
    """
    try:
        repos_collection = get_repos_collection()
        docs_collection = mongo_client[DB_NAME][COLLECTION_NAME]

        stats = {
            "total_repositories": repos_collection.count_documents({}),
            "total_documents": docs_collection.count_documents({}),
        }
        # Records without a file_count contribute 0 to the total.
        file_counts = repos_collection.find({}, {"file_count": 1})
        stats["total_files"] = sum(
            record.get("file_count", 0) for record in file_counts
        )
        return stats
    except Exception as e:
        print(f"Error getting repository stats: {e}")
        return {"total_repositories": 0, "total_documents": 0, "total_files": 0}