alx-d committed on
Commit
7f0ef09
·
verified ·
1 Parent(s): 106fe41

Upload folder using huggingface_hub

Browse files
Files changed (1) hide show
  1. advanced_rag.py +62 -26
advanced_rag.py CHANGED
@@ -30,10 +30,16 @@ from langchain.llms.base import LLM
30
  from typing import Any, Optional, List
31
  import typing
32
  import time
33
- import requests
34
  import re
 
 
 
 
 
35
 
36
-
 
 
37
  print("Pydantic Version: ")
38
  print(pydantic.__version__)
39
  # Add Mistral imports with fallback handling
@@ -389,9 +395,9 @@ def load_txt_from_url(url: str) -> Document:
389
  else:
390
  raise Exception(f"Failed to load {url} with status {response.status_code}")
391
 
392
- def load_txt_from_google_drive(link: str) -> Document:
393
  """
394
- Load text from a Google Drive shared link
395
  """
396
  # Extract the file ID from the Google Drive link
397
  file_id_match = re.search(r'\/d\/(.*?)\/view', link)
@@ -403,15 +409,52 @@ def load_txt_from_google_drive(link: str) -> Document:
403
  # Create direct download link
404
  download_url = f"https://drive.google.com/uc?export=download&id={file_id}"
405
 
406
- # Request the file content
407
- response = requests.get(download_url)
408
  if response.status_code != 200:
409
  raise ValueError(f"Failed to download file from Google Drive. Status code: {response.status_code}")
410
 
411
- # Create a Document object
412
- content = response.text
413
- metadata = {"source": link}
414
- return Document(page_content=content, metadata=metadata)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
415
 
416
  class ElevatedRagChain:
417
  def __init__(self, llm_choice: str = "Meta-Llama-3", prompt_template: str = default_prompt,
@@ -636,7 +679,15 @@ class ElevatedRagChain:
636
  debug_print(f"Processing files using {self.llm_choice}")
637
  self.raw_data = []
638
  for link in file_links:
639
- if link.lower().endswith(".pdf"):
 
 
 
 
 
 
 
 
640
  debug_print(f"Loading PDF: {link}")
641
  loaded_docs = OnlinePDFLoader(link).load()
642
  if loaded_docs:
@@ -649,21 +700,6 @@ class ElevatedRagChain:
649
  self.raw_data.append(load_txt_from_url(link))
650
  except Exception as e:
651
  debug_print(f"Error loading TXT file {link}: {e}")
652
- elif "drive.google.com" in link and ("file/d" in link or "open?id=" in link):
653
- debug_print(f"Loading Google Drive file: {link}")
654
- try:
655
- if ".pdf" in link.lower():
656
- # Google Drive PDF handling
657
- file_id = re.search(r'\/d\/(.*?)\/view', link).group(1)
658
- direct_pdf_url = f"https://drive.google.com/uc?export=download&id={file_id}"
659
- loaded_docs = OnlinePDFLoader(direct_pdf_url).load()
660
- if loaded_docs:
661
- self.raw_data.append(loaded_docs[0])
662
- else:
663
- # Assuming it's a text file
664
- self.raw_data.append(load_txt_from_google_drive(link))
665
- except Exception as e:
666
- debug_print(f"Error loading Google Drive file {link}: {e}")
667
  else:
668
  debug_print(f"File type not supported for URL: {link}")
669
 
 
30
  from typing import Any, Optional, List
31
  import typing
32
  import time
 
33
  import re
34
+ import requests
35
+ from langchain.schema import Document
36
+ from langchain.document_loaders import PyPDFLoader
37
+ import tempfile
38
+ import mimetypes
39
 
40
def get_mime_type(file_path):
    """Return the MIME type for *file_path*.

    The type is first guessed from the file name via ``mimetypes``. Names
    without a recognisable extension (for example temporary download files)
    cannot be classified that way, so in that case the start of the file is
    inspected for the ``%PDF-`` header to recognise PDF content.

    Args:
        file_path: Path (or URL-like name) of the file to classify.

    Returns:
        The detected MIME type string, or ``'application/octet-stream'``
        when the type cannot be determined.
    """
    guessed = mimetypes.guess_type(file_path)[0]
    if guessed:
        return guessed

    # The name gave no hint; fall back to sniffing the content. A missing or
    # unreadable file is not an error here, it just stays "unknown".
    try:
        with open(file_path, 'rb') as handle:
            head = handle.read(1024)
    except (OSError, ValueError):
        head = b''

    if head.lstrip().startswith(b'%PDF-'):
        return 'application/pdf'
    return 'application/octet-stream'
42
+
43
  print("Pydantic Version: ")
44
  print(pydantic.__version__)
45
  # Add Mistral imports with fallback handling
 
395
  else:
396
  raise Exception(f"Failed to load {url} with status {response.status_code}")
397
 
398
+ def load_file_from_google_drive(link: str) -> list:
399
  """
400
+ Load PDF or text from a Google Drive shared link by detecting the file type
401
  """
402
  # Extract the file ID from the Google Drive link
403
  file_id_match = re.search(r'\/d\/(.*?)\/view', link)
 
409
  # Create direct download link
410
  download_url = f"https://drive.google.com/uc?export=download&id={file_id}"
411
 
412
+ # Download the file to a temporary location
413
+ response = requests.get(download_url, stream=True)
414
  if response.status_code != 200:
415
  raise ValueError(f"Failed to download file from Google Drive. Status code: {response.status_code}")
416
 
417
+ # Create a temporary file
418
+ with tempfile.NamedTemporaryFile(delete=False) as temp_file:
419
+ temp_path = temp_file.name
420
+ # Write content to the temp file
421
+ for chunk in response.iter_content(chunk_size=1024):
422
+ if chunk:
423
+ temp_file.write(chunk)
424
+ # With:
425
+
426
+ try:
427
+ # Detect the file type with the get_mime_type helper
428
+ mime_type = get_mime_type(temp_path)
429
+ debug_print(f"Detected MIME type: {mime_type}")
430
+
431
+ if mime_type == 'application/pdf':
432
+ # Handle PDF file
433
+ loader = PyPDFLoader(temp_path)
434
+ documents = loader.load()
435
+
436
+ # Update metadata to include source URL
437
+ for doc in documents:
438
+ doc.metadata["source"] = link
439
+
440
+ debug_print(f"Loaded PDF with {len(documents)} pages")
441
+ return documents
442
+ else:
443
+ # Handle as text file
444
+ with open(temp_path, 'r', encoding='utf-8', errors='ignore') as file:
445
+ content = file.read()
446
+
447
+ metadata = {"source": link}
448
+ return [Document(page_content=content, metadata=metadata)]
449
+ except Exception as e:
450
+ # Log the error for debugging
451
+ debug_print(f"Error processing file: {str(e)}")
452
+ raise e
453
+ finally:
454
+ # Clean up the temporary file
455
+ if os.path.exists(temp_path):
456
+ os.unlink(temp_path)
457
+
458
 
459
  class ElevatedRagChain:
460
  def __init__(self, llm_choice: str = "Meta-Llama-3", prompt_template: str = default_prompt,
 
679
  debug_print(f"Processing files using {self.llm_choice}")
680
  self.raw_data = []
681
  for link in file_links:
682
+ if "drive.google.com" in link and ("file/d" in link or "open?id=" in link):
683
+ debug_print(f"Loading Google Drive file: {link}")
684
+ try:
685
+ documents = load_file_from_google_drive(link)
686
+ self.raw_data.extend(documents)
687
+ debug_print(f"Successfully loaded {len(documents)} pages/documents from Google Drive")
688
+ except Exception as e:
689
+ debug_print(f"Error loading Google Drive file {link}: {e}")
690
+ elif link.lower().endswith(".pdf"):
691
  debug_print(f"Loading PDF: {link}")
692
  loaded_docs = OnlinePDFLoader(link).load()
693
  if loaded_docs:
 
700
  self.raw_data.append(load_txt_from_url(link))
701
  except Exception as e:
702
  debug_print(f"Error loading TXT file {link}: {e}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
703
  else:
704
  debug_print(f"File type not supported for URL: {link}")
705