AIxBI_AI_Plagiarism_detection_and_resolution

Runtime error

App Files Community

mohitrulzz committed on Aug 4

Commit

da3ff3f

verified ·

1 Parent(s): 27c8f12

Update app.py

Browse files

Files changed (1) hide show

app.py +72 -27

app.py CHANGED Viewed

@@ -108,59 +108,104 @@ def calculate_file_hash(file_path: str) -> str:
             hash_sha256.update(chunk)
     return hash_sha256.hexdigest()
-def extract_text(file_obj) -> Optional[Tuple[str, dict]]:
-    """Enhanced text extraction with metadata"""
     if file_obj is None:
-        return None, None
     name = file_obj.name
     ext = os.path.splitext(name)[1].lower()
     # Copy to temp file preserving extension
     with tempfile.NamedTemporaryFile(delete=False, suffix=ext) as tmp:
         shutil.copy(file_obj.name, tmp.name)
         tmp_path = tmp.name
-    metadata = {
-        'file_type': ext,
-        'file_size': os.path.getsize(tmp_path),
-        'file_hash': calculate_file_hash(tmp_path)
-    }
     try:
         if ext == ".pdf":
             with pdfplumber.open(tmp_path) as pdf:
                 text = " ".join(page.extract_text() or "" for page in pdf.pages)
-                metadata['page_count'] = len(pdf.pages)
         elif ext == ".docx":
             doc = docx.Document(tmp_path)
             text = " ".join(p.text for p in doc.paragraphs)
-            metadata['paragraph_count'] = len(doc.paragraphs)
         elif ext == ".txt":
             with open(tmp_path, "r", encoding="utf-8", errors="ignore") as f:
                 text = f.read()
         else:
-            logger.warning(f"Unsupported file type: {ext}")
-            return None, None
-    except Exception as e:
-        logger.error(f"Error extracting text from {name}: {e}")
-        return None, None
     finally:
         try:
             os.unlink(tmp_path)
         except:
             pass
-    if not text or len(text.strip()) < 50:
-        logger.warning("Extracted text is too short or empty")
         return None, None
-    text = text.strip()
-    metadata.update({
-        'word_count': len(text.split()),
-        'char_count': len(text)
-    })
     return text, metadata
@@ -508,8 +553,8 @@ def analyze_document(student_name: str, student_id: str, file_obj) -> Tuple:
     logger.info(f"Starting analysis for {student_name} ({student_id})")
     try:
-        # Extract text and metadata
-        result = extract_text(file_obj)
         if result is None or result[0] is None:
             return "❌ Error: Could not read the file. Please upload a valid PDF, DOCX, or TXT.", None, None, None, None, None

             hash_sha256.update(chunk)
     return hash_sha256.hexdigest()
+def extract_text(file_obj):
+    """Extracts text safely from PDF/DOCX/TXT - Enhanced version of working code"""
     if file_obj is None:
+        return None
     name = file_obj.name
     ext = os.path.splitext(name)[1].lower()
     # Copy to temp file preserving extension
     with tempfile.NamedTemporaryFile(delete=False, suffix=ext) as tmp:
         shutil.copy(file_obj.name, tmp.name)
         tmp_path = tmp.name
     try:
         if ext == ".pdf":
             with pdfplumber.open(tmp_path) as pdf:
                 text = " ".join(page.extract_text() or "" for page in pdf.pages)
         elif ext == ".docx":
             doc = docx.Document(tmp_path)
             text = " ".join(p.text for p in doc.paragraphs)
         elif ext == ".txt":
             with open(tmp_path, "r", encoding="utf-8", errors="ignore") as f:
                 text = f.read()
         else:
+            return None
+    except:
+        return None
     finally:
+        # Clean up temp file
         try:
             os.unlink(tmp_path)
         except:
             pass
+    return text.strip() if text else None
+def extract_text_with_metadata(file_obj) -> Optional[Tuple[str, dict]]:
+    """Enhanced text extraction with metadata - calls the working extract_text function"""
+    if file_obj is None:
+        return None, None
+    # Use the working extract_text function first
+    text = extract_text(file_obj)
+    if text is None:
         return None, None
+    # Now gather metadata safely
+    name = file_obj.name
+    ext = os.path.splitext(name)[1].lower()
+    # Create temporary file again for metadata extraction
+    with tempfile.NamedTemporaryFile(delete=False, suffix=ext) as tmp:
+        shutil.copy(file_obj.name, tmp.name)
+        tmp_path = tmp.name
+    try:
+        metadata = {
+            'file_type': ext,
+            'file_size': os.path.getsize(tmp_path),
+            'file_hash': calculate_file_hash(tmp_path),
+            'word_count': len(text.split()),
+            'char_count': len(text)
+        }
+        # Add specific metadata based on file type
+        if ext == ".pdf":
+            try:
+                with pdfplumber.open(tmp_path) as pdf:
+                    metadata['page_count'] = len(pdf.pages)
+            except:
+                metadata['page_count'] = 'Unknown'
+        elif ext == ".docx":
+            try:
+                doc = docx.Document(tmp_path)
+                metadata['paragraph_count'] = len(doc.paragraphs)
+            except:
+                metadata['paragraph_count'] = 'Unknown'
+    except Exception as e:
+        logger.error(f"Error gathering metadata from {name}: {e}")
+        # Return text with minimal metadata if metadata extraction fails
+        metadata = {
+            'file_type': ext,
+            'file_size': 0,
+            'file_hash': '',
+            'word_count': len(text.split()),
+            'char_count': len(text)
+        }
+    finally:
+        try:
+            os.unlink(tmp_path)
+        except:
+            pass
+    # Final validation
+    if len(text.strip()) < 50:
+        logger.warning("Extracted text is too short for meaningful analysis")
+        return None, None
     return text, metadata
     logger.info(f"Starting analysis for {student_name} ({student_id})")
     try:
+        # Extract text and metadata using the working function
+        result = extract_text_with_metadata(file_obj)
         if result is None or result[0] is None:
             return "❌ Error: Could not read the file. Please upload a valid PDF, DOCX, or TXT.", None, None, None, None, None