Spaces:
Sleeping
Sleeping
| import hashlib | |
| import os | |
| import sqlite3 | |
| from langchain_community.document_loaders import PyPDFLoader | |
class DuplicateDetector:
    """Detect already-seen PDF documents via a SHA-256 text fingerprint.

    A fingerprint is the SHA-256 hex digest of the text extracted from the
    first ``max_pages`` pages of a PDF. Fingerprints are persisted in the
    ``documents`` table of the SQLite database at ``db_path``. Every
    operation opens its own connection and closes it before returning.
    """

    def __init__(self, db_path: str = "persiststorage.db",
                 max_pages: int = 10) -> None:
        """Remember the configuration and make sure the table exists.

        Args:
            db_path: Path of the SQLite database file.
            max_pages: Number of leading pages whose text is hashed.
        """
        # Initialised empty; no method of this class reads or updates it.
        self.fingerprints_seen = set()
        self.db_path = db_path
        self.max_pages = max_pages
        self._init_db()

    def _init_db(self) -> None:
        """Create the ``documents`` table if it does not exist yet."""
        conn = sqlite3.connect(self.db_path)
        try:
            conn.execute('''
                CREATE TABLE IF NOT EXISTS documents (
                    id INTEGER PRIMARY KEY AUTOINCREMENT,
                    filename TEXT,
                    fingerprint TEXT UNIQUE,
                    created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
                )
            ''')
            conn.commit()
        finally:
            # Close even when the statement fails so the handle never leaks.
            conn.close()

    def is_duplicate(self, pdf_path: str) -> bool:
        """Return True if the fingerprint of ``pdf_path`` is already stored.

        Raises:
            ValueError: If the PDF cannot be fingerprinted.
            sqlite3.Error: If the lookup fails.
        """
        fingerprint = self.generate_fingerprints(pdf_path)
        conn = sqlite3.connect(self.db_path)
        try:
            cursor = conn.cursor()
            cursor.execute("select id from documents where fingerprint =?", (fingerprint,))
            return cursor.fetchone() is not None
        finally:
            conn.close()

    def store_fingerprints(self, pdf_path: str) -> None:
        """Record the fingerprint of ``pdf_path`` with its base file name.

        Storing a document whose fingerprint is already present is a no-op.

        Raises:
            ValueError: If the PDF cannot be fingerprinted.
            sqlite3.Error: For database failures other than the fingerprint
                already being present.
        """
        fingerprint = self.generate_fingerprints(pdf_path)
        conn = sqlite3.connect(self.db_path)
        try:
            conn.execute("INSERT INTO DOCUMENTS(filename, fingerprint) values(?,?)",
                         (os.path.basename(pdf_path), fingerprint))
            conn.commit()
        except sqlite3.IntegrityError:
            # The UNIQUE constraint on `fingerprint` fired: the document is
            # already recorded, so there is nothing left to do. sqlite3
            # reports this as IntegrityError, never as ValueError.
            pass
        finally:
            conn.close()

    def generate_fingerprints(self, pdf_path: str) -> str:
        """Return the SHA-256 hex digest of the PDF's leading page text.

        The text of the first ``max_pages`` pages is concatenated without a
        separator and hashed as UTF-8.

        NOTE(review): when no text is extracted (presumably the case for
        image-only scans; verify) every such file hashes the empty string
        and therefore shares one fingerprint.

        Raises:
            ValueError: If loading or hashing raises ``ValueError``; the
                original error is chained as the cause.
        """
        try:
            pages = PyPDFLoader(pdf_path).load()
            text = "".join(page.page_content for page in pages[:self.max_pages])
            # Encoding stays inside the try block: UnicodeEncodeError is a
            # ValueError subclass and must be wrapped like any other.
            return hashlib.sha256(text.encode("utf-8")).hexdigest()
        except ValueError as e:
            raise ValueError(f"Failed to fingerprint PDF: {e}") from e