Spaces:

anoopreddyyeddula
/

Automated-Insurance-Claim-Validation-System

Sleeping

App Files Files Community

anoopreddyyeddula committed on Apr 27

Commit

f1dff19

1 Parent(s): f1351ad

fix: resolve merge conflicts and update UI

Browse files

Files changed (1) hide show

app.py +92 -21

app.py CHANGED Viewed

@@ -10,15 +10,61 @@ import pandas as pd
 import logging
 from datetime import datetime
 import os
 # Basic logging setup
 logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)
-# Initialize models
-reader = easyocr.Reader(['en'])
-text_classifier = pipeline("text-classification", model="distilbert-base-uncased-finetuned-sst-2-english")
-doc_classifier = pipeline("image-classification", model="microsoft/donut-base-finetuned-rvlcdip")  # Better for document classification
 def validate_insurance_claim(text):
     """Validate if the text contains insurance claim related content"""
@@ -35,34 +81,59 @@ def process_document(file):
         # Handle PDF files
         if file_extension == '.pdf':
-            images = pdf2image.convert_from_bytes(file.read(), first_page=1, last_page=1)
-            if not images:
-                return "Failed to process insurance claim PDF", None, None
-            image = images[0]
         # Handle image files
         elif file_extension in ('.png', '.jpg', '.jpeg'):
-            image = Image.open(file)
         else:
             return "Unsupported file format. Please upload PDF or image files.", None, None
-        # Extract text
-        result = reader.readtext(np.array(image))
-        text = ' '.join([t[1] for t in result])
-        # Format the extracted text more clearly
         formatted_text = format_insurance_claim(text)
         # Validate if it's an insurance claim
         if not validate_insurance_claim(text):
             return "Document does not appear to be an insurance claim", None, None
-        # Classify text sentiment/validity
-        text_analysis = text_classifier(text[:512])[0]
-        # Classify document type
-        doc_analysis = doc_classifier(image)[0]
-        # Generate validation results with more detailed analysis
         validation_result = analyze_claim_validity(text_analysis['score'])
         return (
@@ -76,8 +147,8 @@ def process_document(file):
         )
     except Exception as e:
-        logger.error(f"Error processing insurance claim: {str(e)}")
-        return f"Error processing claim: {str(e)}", None, None
 def format_insurance_claim(text):
     """Format the extracted text in a more readable way"""

 import logging
 from datetime import datetime
 import os
+import torch
+# PyTorch runtime tuning; runs at module import, before init_models() is called further down
+os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'max_split_size_mb:512'  # CUDA caching-allocator option: do not split cached blocks larger than 512 MB
+torch.backends.cudnn.benchmark = True  # let cuDNN benchmark its algorithms and pick the fastest one
+# Optional memory relief: uncomment the three lines below if the process runs out of memory:
+# import gc
+# gc.collect()
+# torch.cuda.empty_cache()
 # Basic logging setup
 logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)
+# Pick the device index handed to the pipelines: 0 (first CUDA GPU) when available, otherwise -1 (CPU); precision is not changed here
+device = 0 if torch.cuda.is_available() else -1
+logger.info(f"Using device: {'CUDA' if device == 0 else 'CPU'}")
+# Build the OCR reader and the two classification pipelines; returns (reader, text_classifier, doc_classifier) and re-raises any failure after logging it
+def init_models():
+    try:
+        # EasyOCR English reader; uses the GPU only when `device` is 0 and keeps its weights under ./models
+        reader = easyocr.Reader(['en'], gpu=bool(device == 0),
+                              model_storage_directory='./models',
+                              download_enabled=True)  # missing weights may be downloaded on first use
+        # Text classifier on `device`. NOTE(review): the model id is an SST-2 sentiment checkpoint, yet its score later feeds analyze_claim_validity - confirm this is intended
+        text_classifier = pipeline(
+            "text-classification",
+            model="distilbert-base-uncased-finetuned-sst-2-english",
+            device=device,
+            model_kwargs={"low_cpu_mem_usage": True}  # extra keyword arguments forwarded to the model loader
+        )
+        # Document-image classifier on `device` (a DiT checkpoint fine-tuned on RVL-CDIP, per the model id)
+        doc_classifier = pipeline(
+            "image-classification",
+            model="microsoft/dit-base-finetuned-rvlcdip",
+            device=device,
+            model_kwargs={"low_cpu_mem_usage": True}  # same loader option as the text classifier
+        )
+        return reader, text_classifier, doc_classifier
+    except Exception as e:
+        logger.error(f"Error initializing models: {str(e)}")  # log here, then propagate to the caller
+        raise
+try:  # load every model once, at import time, into module-level names
+    logger.info("Initializing models...")
+    reader, text_classifier, doc_classifier = init_models()
+    logger.info("Models initialized successfully")
+except Exception as e:
+    logger.error(f"Failed to initialize models: {str(e)}")
+    raise  # re-raise after logging, so the module import fails if the models cannot be loaded
 def validate_insurance_claim(text):
     """Validate if the text contains insurance claim related content"""
         # Handle PDF files
         if file_extension == '.pdf':
+            try:
+                images = pdf2image.convert_from_bytes(file.read(), first_page=1, last_page=1)
+                if not images:
+                    return "Failed to process insurance claim PDF", None, None
+                image = images[0]
+            except Exception as e:
+                logger.error(f"PDF processing error: {str(e)}")
+                return "Error processing PDF file", None, None
         # Handle image files
         elif file_extension in ('.png', '.jpg', '.jpeg'):
+            try:
+                image = Image.open(file)
+            except Exception as e:
+                logger.error(f"Image processing error: {str(e)}")
+                return "Error processing image file", None, None
         else:
             return "Unsupported file format. Please upload PDF or image files.", None, None
+        # Convert image to RGB if necessary
+        if image.mode != 'RGB':
+            image = image.convert('RGB')
+        # Extract text with error handling
+        try:
+            result = reader.readtext(np.array(image))
+            text = ' '.join([t[1] for t in result])
+        except Exception as e:
+            logger.error(f"Text extraction error: {str(e)}")
+            return "Error extracting text from document", None, None
+        # Format the extracted text
         formatted_text = format_insurance_claim(text)
         # Validate if it's an insurance claim
         if not validate_insurance_claim(text):
             return "Document does not appear to be an insurance claim", None, None
+        # Classify text with error handling
+        try:
+            text_analysis = text_classifier(text[:512])[0]
+        except Exception as e:
+            logger.error(f"Text classification error: {str(e)}")
+            text_analysis = {'score': 0.5}
+        # Classify document with error handling
+        try:
+            doc_analysis = doc_classifier(image)[0]
+        except Exception as e:
+            logger.error(f"Document classification error: {str(e)}")
+            doc_analysis = {'score': 0.5}
+        # Generate validation results
         validation_result = analyze_claim_validity(text_analysis['score'])
         return (
         )
     except Exception as e:
+        logger.error(f"General processing error: {str(e)}")
+        return "Error processing document", None, None
 def format_insurance_claim(text):
     """Format the extracted text in a more readable way"""