import os
import sys
import traceback

# Banner for the whole dependency smoke test.
print("CogniChat Dependencies & PDF Handling Test")

# Step 1: the core LangChain pieces must all be importable. These imports
# are mandatory, so any failure aborts the script with exit status 1.
print("\nTesting core imports...")
try:
    from langchain_community.embeddings import HuggingFaceEmbeddings  # noqa: F401
    from langchain_community.retrievers import BM25Retriever
    from langchain.text_splitter import RecursiveCharacterTextSplitter  # noqa: F401
    from langchain_core.documents import Document
except ImportError as import_failure:
    print(f"Import error: {import_failure}")
    # Give a targeted install hint when the message names the BM25 backend.
    if "rank_bm25" in str(import_failure):
        print("Missing dependency: pip install rank-bm25==0.2.2")
    raise SystemExit(1)
else:
    print("Core LangChain imports successful!")

# Step 2: report which PDF backends can be imported. Each backend is
# optional, so a missing one is only reported; the script keeps running.
# The outer handler catches anything other than ImportError raised while
# probing and reports it without exiting.
try:
    print("\nTesting PDF loading capabilities...")

    # LangChain's PyPDFLoader.
    try:
        from langchain_community.document_loaders import PyPDFLoader  # noqa: F401
    except ImportError:
        print("PyPDFLoader not available")
    else:
        print("PyPDFLoader available")

    # PyMuPDF, imported under its module name `fitz`.
    try:
        import fitz  # noqa: F401
    except ImportError:
        print("PyMuPDF (fitz) not available")
    else:
        print("PyMuPDF (fitz) available - can handle corrupted PDFs")

    # pdfplumber.
    try:
        import pdfplumber  # noqa: F401
    except ImportError:
        print("pdfplumber not available")
    else:
        print("pdfplumber available - additional PDF parsing method")
except Exception as probe_error:
    print(f"Error testing PDF capabilities: {probe_error}")

# Step 3: smoke-test BM25 retrieval end to end. A retriever is built over
# three small in-memory documents and queried once. Any failure prints the
# error plus a full traceback and aborts the script with exit status 1.
try:
    print("\nTesting BM25 Retriever...")

    test_docs = [
        Document(page_content="This is the first test document about machine learning."),
        Document(page_content="This is the second document discussing natural language processing."),
        Document(page_content="The third document covers artificial intelligence topics."),
    ]

    bm25_retriever = BM25Retriever.from_documents(test_docs)
    # Return at most two documents per query.
    bm25_retriever.k = 2

    query = "machine learning"
    # `invoke` is the Runnable entry point for retrievers; the older
    # `get_relevant_documents` is deprecated in current LangChain releases.
    results = bm25_retriever.invoke(query)

    print("BM25 retriever created and tested successfully!")
    print(f"Retrieved {len(results)} documents for query: '{query}'")
except Exception as e:
    print(f"✗ Error testing BM25 retriever: {e}")
    # Show the full stack so a missing or broken dependency is easy to find.
    traceback.print_exc()
    sys.exit(1)

# Closing report: success banner, the capabilities the application is
# expected to have, the install command, and the pinned dependency versions.
# Each entry is printed on its own line, in order.
_SUMMARY_LINES = (
    "\nAll tests completed successfully!",
    "\nThe application should now handle:",
    " • Regular file uploads and processing",
    " • Corrupted PDF files with multiple fallback methods",
    " • BM25 and FAISS hybrid retrieval",
    " • Proper error messages for failed file processing",
    "\nMake sure to install all dependencies with:",
    " pip install -r requirements.txt",
    "\nKey Dependencies Added/Updated",
    " • rank-bm25==0.2.2 (for BM25 retrieval)",
    " • pymupdf==1.23.26 (PDF fallback method)",
    " • pdfplumber==0.10.3 (additional PDF parsing)",
)

for _summary_line in _SUMMARY_LINES:
    print(_summary_line)