Spaces:

alx-d
/

PhiRAG

Running

App Files Files Community

alx-d committed on Mar 19

Commit

7f0ef09

verified ·

1 Parent(s): 106fe41

Upload folder using huggingface_hub

Browse files

Files changed (1) hide show

advanced_rag.py +62 -26

advanced_rag.py CHANGED Viewed

@@ -30,10 +30,16 @@ from langchain.llms.base import LLM
 from typing import Any, Optional, List
 import typing
 import time
-import requests
 import re
 print("Pydantic Version: ")
 print(pydantic.__version__)
 # Add Mistral imports with fallback handling
@@ -389,9 +395,9 @@ def load_txt_from_url(url: str) -> Document:
     else:
         raise Exception(f"Failed to load {url} with status {response.status_code}")
-def load_txt_from_google_drive(link: str) -> Document:
     """
-    Load text from a Google Drive shared link
     """
     # Extract the file ID from the Google Drive link
     file_id_match = re.search(r'\/d\/(.*?)\/view', link)
@@ -403,15 +409,52 @@ def load_txt_from_google_drive(link: str) -> Document:
     # Create direct download link
     download_url = f"https://drive.google.com/uc?export=download&id={file_id}"
-    # Request the file content
-    response = requests.get(download_url)
     if response.status_code != 200:
         raise ValueError(f"Failed to download file from Google Drive. Status code: {response.status_code}")
-    # Create a Document object
-    content = response.text
-    metadata = {"source": link}
-    return Document(page_content=content, metadata=metadata)
 class ElevatedRagChain:
     def __init__(self, llm_choice: str = "Meta-Llama-3", prompt_template: str = default_prompt,
@@ -636,7 +679,15 @@ class ElevatedRagChain:
         debug_print(f"Processing files using {self.llm_choice}")
         self.raw_data = []
         for link in file_links:
-            if link.lower().endswith(".pdf"):
                 debug_print(f"Loading PDF: {link}")
                 loaded_docs = OnlinePDFLoader(link).load()
                 if loaded_docs:
@@ -649,21 +700,6 @@ class ElevatedRagChain:
                     self.raw_data.append(load_txt_from_url(link))
                 except Exception as e:
                     debug_print(f"Error loading TXT file {link}: {e}")
-            elif "drive.google.com" in link and ("file/d" in link or "open?id=" in link):
-                debug_print(f"Loading Google Drive file: {link}")
-                try:
-                    if ".pdf" in link.lower():
-                        # Google Drive PDF handling
-                        file_id = re.search(r'\/d\/(.*?)\/view', link).group(1)
-                        direct_pdf_url = f"https://drive.google.com/uc?export=download&id={file_id}"
-                        loaded_docs = OnlinePDFLoader(direct_pdf_url).load()
-                        if loaded_docs:
-                            self.raw_data.append(loaded_docs[0])
-                    else:
-                        # Assuming it's a text file
-                        self.raw_data.append(load_txt_from_google_drive(link))
-                except Exception as e:
-                    debug_print(f"Error loading Google Drive file {link}: {e}")
             else:
                 debug_print(f"File type not supported for URL: {link}")

 from typing import Any, Optional, List
 import typing
 import time
 import re
+import requests
+from langchain.schema import Document
+from langchain.document_loaders import PyPDFLoader
+import tempfile
+import mimetypes
def get_mime_type(file_path):
    """Return the MIME type of the file at *file_path*.

    The name is tried first via ``mimetypes.guess_type``, which only looks at
    the file extension. Extension-less paths (for example the temporary files
    created by ``tempfile.NamedTemporaryFile``) get no answer that way, so the
    file's leading bytes are then checked for the ``%PDF-`` signature.

    Args:
        file_path: Path (or bare file name) to classify.

    Returns:
        The extension-based MIME type when one is known; ``'application/pdf'``
        when the content starts with the PDF signature; otherwise
        ``'application/octet-stream'``.
    """
    guessed = mimetypes.guess_type(file_path)[0]
    if guessed:
        return guessed
    try:
        with open(file_path, 'rb') as handle:
            # Every PDF starts with the "%PDF-" header marker.
            if handle.read(5) == b'%PDF-':
                return 'application/pdf'
    except OSError:
        # The path may be a bare name that does not exist on disk; keep the
        # previous behaviour of falling back to the generic type.
        pass
    return 'application/octet-stream'
 print("Pydantic Version: ")
 print(pydantic.__version__)
 # Add Mistral imports with fallback handling
     else:
         raise Exception(f"Failed to load {url} with status {response.status_code}")
+def load_file_from_google_drive(link: str) -> list:
     """
+    Load PDF or text from a Google Drive shared link by detecting the file type
     """
     # Extract the file ID from the Google Drive link
     file_id_match = re.search(r'\/d\/(.*?)\/view', link)
     # Create direct download link
     download_url = f"https://drive.google.com/uc?export=download&id={file_id}"
+    # Download the file to a temporary location
+    response = requests.get(download_url, stream=True)
     if response.status_code != 200:
         raise ValueError(f"Failed to download file from Google Drive. Status code: {response.status_code}")
+    # Create a temporary file
+    with tempfile.NamedTemporaryFile(delete=False) as temp_file:
+        temp_path = temp_file.name
+        # Write content to the temp file
+        for chunk in response.iter_content(chunk_size=1024):
+            if chunk:
+                temp_file.write(chunk)
+    # Detect the file type, load the documents, and always remove the temp file afterwards
+    try:
+        # Detect file type via the get_mime_type() helper
+        mime_type = get_mime_type(temp_path)
+        debug_print(f"Detected MIME type: {mime_type}")
+        if mime_type == 'application/pdf':
+            # Handle PDF file
+            loader = PyPDFLoader(temp_path)
+            documents = loader.load()
+            # Update metadata to include source URL
+            for doc in documents:
+                doc.metadata["source"] = link
+            debug_print(f"Loaded PDF with {len(documents)} pages")
+            return documents
+        else:
+            # Handle as text file
+            with open(temp_path, 'r', encoding='utf-8', errors='ignore') as file:
+                content = file.read()
+            metadata = {"source": link}
+            return [Document(page_content=content, metadata=metadata)]
+    except Exception as e:
+        # Log the error for debugging
+        debug_print(f"Error processing file: {str(e)}")
+        raise e
+    finally:
+        # Clean up the temporary file
+        if os.path.exists(temp_path):
+            os.unlink(temp_path)
 class ElevatedRagChain:
     def __init__(self, llm_choice: str = "Meta-Llama-3", prompt_template: str = default_prompt,
         debug_print(f"Processing files using {self.llm_choice}")
         self.raw_data = []
         for link in file_links:
+            if "drive.google.com" in link and ("file/d" in link or "open?id=" in link):
+                debug_print(f"Loading Google Drive file: {link}")
+                try:
+                    documents = load_file_from_google_drive(link)
+                    self.raw_data.extend(documents)
+                    debug_print(f"Successfully loaded {len(documents)} pages/documents from Google Drive")
+                except Exception as e:
+                    debug_print(f"Error loading Google Drive file {link}: {e}")
+            elif link.lower().endswith(".pdf"):
                 debug_print(f"Loading PDF: {link}")
                 loaded_docs = OnlinePDFLoader(link).load()
                 if loaded_docs:
                     self.raw_data.append(load_txt_from_url(link))
                 except Exception as e:
                     debug_print(f"Error loading TXT file {link}: {e}")
             else:
                 debug_print(f"File type not supported for URL: {link}")