anoopreddyyeddula committed on
Commit
f1dff19
·
1 Parent(s): f1351ad

fix: resolve merge conflicts and update UI

Browse files
Files changed (1) hide show
  1. app.py +92 -21
app.py CHANGED
@@ -10,15 +10,61 @@ import pandas as pd
10
  import logging
11
  from datetime import datetime
12
  import os
 
 
 
 
 
 
 
 
 
 
13
 
14
  # Basic logging setup
15
  logging.basicConfig(level=logging.INFO)
16
  logger = logging.getLogger(__name__)
17
 
18
- # Initialize models
19
- reader = easyocr.Reader(['en'])
20
- text_classifier = pipeline("text-classification", model="distilbert-base-uncased-finetuned-sst-2-english")
21
- doc_classifier = pipeline("image-classification", model="microsoft/donut-base-finetuned-rvlcdip") # Better for document classification
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
22
 
23
  def validate_insurance_claim(text):
24
  """Validate if the text contains insurance claim related content"""
@@ -35,34 +81,59 @@ def process_document(file):
35
 
36
  # Handle PDF files
37
  if file_extension == '.pdf':
38
- images = pdf2image.convert_from_bytes(file.read(), first_page=1, last_page=1)
39
- if not images:
40
- return "Failed to process insurance claim PDF", None, None
41
- image = images[0]
 
 
 
 
 
42
  # Handle image files
43
  elif file_extension in ('.png', '.jpg', '.jpeg'):
44
- image = Image.open(file)
 
 
 
 
45
  else:
46
  return "Unsupported file format. Please upload PDF or image files.", None, None
47
 
48
- # Extract text
49
- result = reader.readtext(np.array(image))
50
- text = ' '.join([t[1] for t in result])
51
 
52
- # Format the extracted text more clearly
 
 
 
 
 
 
 
 
53
  formatted_text = format_insurance_claim(text)
54
 
55
  # Validate if it's an insurance claim
56
  if not validate_insurance_claim(text):
57
  return "Document does not appear to be an insurance claim", None, None
58
 
59
- # Classify text sentiment/validity
60
- text_analysis = text_classifier(text[:512])[0]
61
-
62
- # Classify document type
63
- doc_analysis = doc_classifier(image)[0]
 
 
 
 
 
 
 
 
64
 
65
- # Generate validation results with more detailed analysis
66
  validation_result = analyze_claim_validity(text_analysis['score'])
67
 
68
  return (
@@ -76,8 +147,8 @@ def process_document(file):
76
  )
77
 
78
  except Exception as e:
79
- logger.error(f"Error processing insurance claim: {str(e)}")
80
- return f"Error processing claim: {str(e)}", None, None
81
 
82
  def format_insurance_claim(text):
83
  """Format the extracted text in a more readable way"""
 
10
  import logging
11
  from datetime import datetime
12
  import os
13
+ import torch
14
+
15
+ # Add these near the top of your script, after imports
16
+ os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'max_split_size_mb:512'
17
+ torch.backends.cudnn.benchmark = True
18
+
19
+ # If you're running out of memory, uncomment these lines:
20
+ # import gc
21
+ # gc.collect()
22
+ # torch.cuda.empty_cache()
23
 
24
  # Basic logging setup
25
  logging.basicConfig(level=logging.INFO)
26
  logger = logging.getLogger(__name__)
27
 
28
+ # Initialize models with specific device placement and lower precision
29
+ device = 0 if torch.cuda.is_available() else -1
30
+ logger.info(f"Using device: {'CUDA' if device == 0 else 'CPU'}")
31
+
32
+ # Initialize models with memory optimization
33
+ def init_models():
34
+ try:
35
+ # Initialize EasyOCR with lower memory usage
36
+ reader = easyocr.Reader(['en'], gpu=bool(device == 0),
37
+ model_storage_directory='./models',
38
+ download_enabled=True)
39
+
40
+ # Initialize text classifier with optimizations
41
+ text_classifier = pipeline(
42
+ "text-classification",
43
+ model="distilbert-base-uncased-finetuned-sst-2-english",
44
+ device=device,
45
+ model_kwargs={"low_cpu_mem_usage": True}
46
+ )
47
+
48
+ # Use a more lightweight document classifier
49
+ doc_classifier = pipeline(
50
+ "image-classification",
51
+ model="microsoft/dit-base-finetuned-rvlcdip",
52
+ device=device,
53
+ model_kwargs={"low_cpu_mem_usage": True}
54
+ )
55
+
56
+ return reader, text_classifier, doc_classifier
57
+ except Exception as e:
58
+ logger.error(f"Error initializing models: {str(e)}")
59
+ raise
60
+
61
+ try:
62
+ logger.info("Initializing models...")
63
+ reader, text_classifier, doc_classifier = init_models()
64
+ logger.info("Models initialized successfully")
65
+ except Exception as e:
66
+ logger.error(f"Failed to initialize models: {str(e)}")
67
+ raise
68
 
69
  def validate_insurance_claim(text):
70
  """Validate if the text contains insurance claim related content"""
 
81
 
82
  # Handle PDF files
83
  if file_extension == '.pdf':
84
+ try:
85
+ images = pdf2image.convert_from_bytes(file.read(), first_page=1, last_page=1)
86
+ if not images:
87
+ return "Failed to process insurance claim PDF", None, None
88
+ image = images[0]
89
+ except Exception as e:
90
+ logger.error(f"PDF processing error: {str(e)}")
91
+ return "Error processing PDF file", None, None
92
+
93
  # Handle image files
94
  elif file_extension in ('.png', '.jpg', '.jpeg'):
95
+ try:
96
+ image = Image.open(file)
97
+ except Exception as e:
98
+ logger.error(f"Image processing error: {str(e)}")
99
+ return "Error processing image file", None, None
100
  else:
101
  return "Unsupported file format. Please upload PDF or image files.", None, None
102
 
103
+ # Convert image to RGB if necessary
104
+ if image.mode != 'RGB':
105
+ image = image.convert('RGB')
106
 
107
+ # Extract text with error handling
108
+ try:
109
+ result = reader.readtext(np.array(image))
110
+ text = ' '.join([t[1] for t in result])
111
+ except Exception as e:
112
+ logger.error(f"Text extraction error: {str(e)}")
113
+ return "Error extracting text from document", None, None
114
+
115
+ # Format the extracted text
116
  formatted_text = format_insurance_claim(text)
117
 
118
  # Validate if it's an insurance claim
119
  if not validate_insurance_claim(text):
120
  return "Document does not appear to be an insurance claim", None, None
121
 
122
+ # Classify text with error handling
123
+ try:
124
+ text_analysis = text_classifier(text[:512])[0]
125
+ except Exception as e:
126
+ logger.error(f"Text classification error: {str(e)}")
127
+ text_analysis = {'score': 0.5}
128
+
129
+ # Classify document with error handling
130
+ try:
131
+ doc_analysis = doc_classifier(image)[0]
132
+ except Exception as e:
133
+ logger.error(f"Document classification error: {str(e)}")
134
+ doc_analysis = {'score': 0.5}
135
 
136
+ # Generate validation results
137
  validation_result = analyze_claim_validity(text_analysis['score'])
138
 
139
  return (
 
147
  )
148
 
149
  except Exception as e:
150
+ logger.error(f"General processing error: {str(e)}")
151
+ return "Error processing document", None, None
152
 
153
  def format_insurance_claim(text):
154
  """Format the extracted text in a more readable way"""