Spaces:

alx-d
/

PhiRAG

Running

App Files Files Community

alx-d committed on Apr 6

Commit

97f878b

verified ·

1 Parent(s): 7f0ef09

Upload folder using huggingface_hub

Browse files

Files changed (2) hide show

advanced_rag.py +68 -54
requirements.txt +1 -1

advanced_rag.py CHANGED Viewed

@@ -22,7 +22,6 @@ from langchain.schema import StrOutputParser, Document
 from langchain_core.runnables import RunnableParallel, RunnableLambda
 from transformers.quantizers.auto import AutoQuantizationConfig
 import gradio as gr
-import requests
 from pydantic import PrivateAttr
 import pydantic
@@ -33,7 +32,7 @@ import time
 import re
 import requests
 from langchain.schema import Document
-from langchain.document_loaders import PyPDFLoader
 import tempfile
 import mimetypes
@@ -395,67 +394,82 @@ def load_txt_from_url(url: str) -> Document:
     else:
         raise Exception(f"Failed to load {url} with status {response.status_code}")
 def load_file_from_google_drive(link: str) -> list:
     """
-    Load PDF or text from a Google Drive shared link by detecting the file type
     """
-    # Extract the file ID from the Google Drive link
-    file_id_match = re.search(r'\/d\/(.*?)\/view', link)
-    if not file_id_match:
-        raise ValueError(f"Could not extract file ID from Google Drive link: {link}")
-    file_id = file_id_match.group(1)
-    # Create direct download link
-    download_url = f"https://drive.google.com/uc?export=download&id={file_id}"
-    # Download the file to a temporary location
-    response = requests.get(download_url, stream=True)
-    if response.status_code != 200:
-        raise ValueError(f"Failed to download file from Google Drive. Status code: {response.status_code}")
-    # Create a temporary file
     with tempfile.NamedTemporaryFile(delete=False) as temp_file:
         temp_path = temp_file.name
-        # Write content to the temp file
-        for chunk in response.iter_content(chunk_size=1024):
-            if chunk:
-                temp_file.write(chunk)
-    # With:
     try:
-        # Detect file type using python-magic
-        mime_type = get_mime_type(temp_path)
-        debug_print(f"Detected MIME type: {mime_type}")
-        if mime_type == 'application/pdf':
-            # Handle PDF file
-            loader = PyPDFLoader(temp_path)
-            documents = loader.load()
-            # Update metadata to include source URL
-            for doc in documents:
-                doc.metadata["source"] = link
-            debug_print(f"Loaded PDF with {len(documents)} pages")
-            return documents
-        else:
-            # Handle as text file
-            with open(temp_path, 'r', encoding='utf-8', errors='ignore') as file:
-                content = file.read()
-            metadata = {"source": link}
-            return [Document(page_content=content, metadata=metadata)]
-    except Exception as e:
-        # Log the error for debugging
-        debug_print(f"Error processing file: {str(e)}")
-        raise e
     finally:
-        # Clean up the temporary file
         if os.path.exists(temp_path):
-            os.unlink(temp_path)
 class ElevatedRagChain:
     def __init__(self, llm_choice: str = "Meta-Llama-3", prompt_template: str = default_prompt,
                  bm25_weight: float = 0.6, temperature: float = 0.5, top_p: float = 0.95) -> None:

 from langchain_core.runnables import RunnableParallel, RunnableLambda
 from transformers.quantizers.auto import AutoQuantizationConfig
 import gradio as gr
 from pydantic import PrivateAttr
 import pydantic
 import re
 import requests
 from langchain.schema import Document
+from langchain_community.document_loaders import PyMuPDFLoader  # Updated loader
 import tempfile
 import mimetypes
     else:
         raise Exception(f"Failed to load {url} with status {response.status_code}")
+from pdfminer.high_level import extract_text
+from langchain_core.documents import Document
def get_confirm_token(response):
    """Return the download-confirmation token carried by *response*, if any.

    Looks through the response cookies and returns the value of the first
    cookie whose name starts with ``download_warning``. Returns ``None`` when
    no such cookie is present.
    """
    warning_values = (
        cookie_value
        for cookie_name, cookie_value in response.cookies.items()
        if cookie_name.startswith("download_warning")
    )
    return next(warning_values, None)
def download_file_from_google_drive(file_id, destination):
    """
    Download a file from Google Drive handling large file confirmation.

    A first request is sent with ``confirm=1``. If the response carries a
    ``download_warning`` cookie (see ``get_confirm_token``), the request is
    repeated with that token as the ``confirm`` value. The body of the final
    response is streamed to ``destination`` via ``save_response_content``.

    Args:
        file_id: Google Drive file identifier.
        destination: Path of the local file to write.

    NOTE(review): the HTTP status is not checked here, so an error page is
    written to ``destination`` like any other body -- callers must validate
    the downloaded content.
    """
    url = "https://docs.google.com/uc"
    # Every query parameter goes through ``params`` so that ``confirm`` is
    # sent exactly once per request (previously the URL hard-coded
    # ``confirm=1`` and the retry appended a second ``confirm`` value).
    base_params = {"export": "download", "id": file_id}

    # The session is a context manager: leaving the block releases pooled
    # connections that a streamed response would otherwise keep open.
    with requests.Session() as session:
        response = session.get(
            url, params={**base_params, "confirm": "1"}, stream=True
        )
        token = get_confirm_token(response)
        if token:
            # Drop the unread first response before retrying with the token.
            response.close()
            response = session.get(
                url, params={**base_params, "confirm": token}, stream=True
            )
        try:
            save_response_content(response, destination)
        finally:
            response.close()
def save_response_content(response, destination):
    """Stream the body of *response* into the file at *destination*.

    The body is read through ``response.iter_content`` in 32768-byte pieces;
    empty pieces are skipped. The destination is opened in binary write mode,
    so any existing file is overwritten.
    """
    block_size = 32768
    with open(destination, "wb") as out_file:
        out_file.writelines(
            piece for piece in response.iter_content(block_size) if piece
        )
def extract_file_id(drive_link: str) -> str:
    """Extract the file ID from a Google Drive link.

    Two link shapes are recognised, tried in this order:

    * path style, ``.../d/<id>...`` (e.g. ``/file/d/<id>/view``)
    * query style, ``...?id=<id>`` or ``...&id=<id>`` (e.g. ``open?id=<id>``)

    Args:
        drive_link: The Google Drive URL to inspect.

    Returns:
        The file ID (letters, digits, ``_`` and ``-``).

    Raises:
        ValueError: If neither link shape is found in ``drive_link``.
    """
    # The path-style pattern stays first so links that already worked keep
    # returning exactly the same ID.
    id_patterns = (
        r"/d/([a-zA-Z0-9_-]+)",
        r"[?&]id=([a-zA-Z0-9_-]+)",
    )
    for pattern in id_patterns:
        match = re.search(pattern, drive_link)
        if match:
            return match.group(1)
    raise ValueError("Could not extract file ID from the provided Google Drive link.")
 def load_file_from_google_drive(link: str) -> list:
     """
     Load a document from a Google Drive link using pdfminer to extract text.
     Returns a list of LangChain Document objects.

     The file is downloaded to a temporary path, its text is extracted with
     ``extract_text``, and the temporary file is removed again whether or not
     extraction succeeded.

     Args:
         link: Google Drive link; its file ID is taken via ``extract_file_id``.

     Returns:
         A one-element list holding a ``Document`` whose ``page_content`` is
         the extracted text and whose metadata ``source`` is ``link``. An
         empty list is returned when extraction fails or yields only
         whitespace (best-effort: the error is printed, not raised).

     Raises:
         ValueError: Propagated from ``extract_file_id`` for unsupported links.
         Errors raised by the download itself are not caught here.
     """
     preview_chars = 1000  # how much of the extracted text is echoed for debugging
     file_id = extract_file_id(link)
     print(f"[DEBUG] Extracted file ID: {file_id}")
     # Only the path is needed: the handle is closed right away and the
     # download helper reopens the file by name.
     with tempfile.NamedTemporaryFile(delete=False) as temp_file:
         temp_path = temp_file.name
     try:
         download_file_from_google_drive(file_id, temp_path)
         print(f"[DEBUG] File downloaded to: {temp_path}")
         try:
             full_text = extract_text(temp_path)
             if not full_text.strip():
                 raise ValueError("Extracted text is empty. The PDF might be image-based.")
             print("[DEBUG] Extracted preview text from PDF:")
             print(full_text[:preview_chars])  # Preview the first preview_chars characters
             document = Document(page_content=full_text, metadata={"source": link})
             return [document]
         except Exception as e:
             # Deliberate best-effort: report the failure and return no documents.
             print(f"[ERROR] Could not extract text from PDF: {e}")
             return []
     finally:
         # Remove directly instead of check-then-remove, which could race
         # with another cleanup of the same path.
         try:
             os.remove(temp_path)
         except FileNotFoundError:
             pass
 class ElevatedRagChain:
     def __init__(self, llm_choice: str = "Meta-Llama-3", prompt_template: str = default_prompt,
                  bm25_weight: float = 0.6, temperature: float = 0.5, top_p: float = 0.95) -> None:

requirements.txt CHANGED Viewed

@@ -46,4 +46,4 @@ pydantic==2.9.0
 sentence-transformers>=2.4.0
-mistralai==1.5.0


46
47	sentence-transformers>=2.4.0
48
49	+ mistralai==1.5.0