Spaces:
Runtime error
Runtime error
Update utility/utils.py
Browse files- utility/utils.py +30 -31
utility/utils.py
CHANGED
|
@@ -43,19 +43,6 @@ HFT = os.getenv('HF_TOKEN')
|
|
| 43 |
# Initialize the InferenceClient
|
| 44 |
client = InferenceClient(model="mistralai/Mistral-7B-Instruct-v0.3", token=HFT)
|
| 45 |
|
| 46 |
-
# Specify a custom model storage directory (ensure this path is writable)
|
| 47 |
-
#model_storage_directory = '/app/models'
|
| 48 |
-
|
| 49 |
-
# Create the reader object and set the model storage directory
|
| 50 |
-
#reader = easyocr.Reader(['en'], model_storage_directory=model_storage_directory)
|
| 51 |
-
|
| 52 |
-
def draw_boxes(image, bounds, color='red', width=2):
    """Trace a closed four-sided outline on ``image`` for every entry in ``bounds``.

    Args:
        image: Image handed to ``ImageDraw.Draw``; it is drawn on in place.
        bounds: Iterable of detections whose first element (``bound[0]``)
            unpacks into exactly four corner points.
        color: Line colour passed to ``draw.line`` as ``fill``.
        width: Line width passed to ``draw.line``.

    Returns:
        The same ``image`` object that was passed in.
    """
    pen = ImageDraw.Draw(image)
    for detection in bounds:
        first, second, third, fourth = detection[0]
        # Repeat the first corner at the end so the outline closes on itself.
        outline = [*first, *second, *third, *fourth, *first]
        pen.line(outline, fill=color, width=width)
    return image
|
| 58 |
-
|
| 59 |
# Load image using OpenCV
|
| 60 |
def load_image(image_path):
|
| 61 |
image = cv2.imread(image_path)
|
|
@@ -108,11 +95,10 @@ def process_image(image_path, scale=2):
|
|
| 108 |
|
| 109 |
return final_image
|
| 110 |
|
|
|
|
| 111 |
def ocr_with_paddle(img):
|
| 112 |
-
|
| 113 |
-
|
| 114 |
-
#ocr = PaddleOCR(lang='en', use_angle_cls=True, det_model_dir=model_dir)
|
| 115 |
-
#ocr = PaddleOCR(lang='en', use_angle_cls=True, det_model_dir=os.environ['PADDLEOCR_HOME'])
|
| 116 |
logging.info(f"PADDLEOCR_HOME: {os.environ['PADDLEOCR_HOME']}")
|
| 117 |
ocr = PaddleOCR(
|
| 118 |
lang='en',
|
|
@@ -123,12 +109,22 @@ def ocr_with_paddle(img):
|
|
| 123 |
)
|
| 124 |
|
| 125 |
result = ocr.ocr(img)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 126 |
|
| 127 |
-
|
| 128 |
-
|
| 129 |
-
|
| 130 |
-
|
|
|
|
|
|
|
| 131 |
|
|
|
|
| 132 |
def extract_text_from_images(image_paths):
|
| 133 |
all_extracted_texts = {}
|
| 134 |
all_extracted_imgs = {}
|
|
@@ -137,15 +133,18 @@ def extract_text_from_images(image_paths):
|
|
| 137 |
# Enhance the image before OCR
|
| 138 |
enhanced_image = process_image(image_path, scale=2)
|
| 139 |
|
| 140 |
-
#
|
|
|
|
|
|
|
|
|
|
| 141 |
img_result = Image.fromarray(enhanced_image)
|
|
|
|
| 142 |
|
|
|
|
| 143 |
result_image_path = os.path.join(RESULT_FOLDER, f'result_{os.path.basename(image_path)}')
|
| 144 |
-
|
| 145 |
-
|
| 146 |
-
# Perform OCR on the enhanced image
|
| 147 |
-
result = ocr_with_paddle(enhanced_image)
|
| 148 |
|
|
|
|
| 149 |
all_extracted_texts[image_path] = result
|
| 150 |
all_extracted_imgs[image_path] = result_image_path
|
| 151 |
except ValueError as ve:
|
|
@@ -318,14 +317,14 @@ def extract_contact_details(text):
|
|
| 318 |
# Email regex
|
| 319 |
email_regex = re.compile(r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b')
|
| 320 |
|
| 321 |
-
#
|
| 322 |
-
#link_regex = re.compile(r'\b(?:https?://)?(?:www\.)?(?:linkedin\.com|github\.com|indeed\.com|[A-Za-z0-9-]+\.[A-Za-z]{2,})[\w./?-]*\b')
|
| 323 |
-
#link_regex = re.compile(r'\b(?:https?://)?(?:www\.)?[a-zA-Z0-9-]+\.(?:[a-zA-Z]{2,})(?:\.[a-zA-Z]{2,})?(?:\.[a-zA-Z]{2,})?(?:[/\w.-]*)*[\w/?&=-]*\b')
|
| 324 |
link_regex = re.compile(r'\b(?:https?:\/\/)?(?:www\.)[a-zA-Z0-9-]+\.(?:com|co\.in|co|io|org|net|edu|gov|mil|int|uk|us|in|de|au|app|tech|xyz|info|biz|fr|dev)\b')
|
|
|
|
| 325 |
# Find all matches in the text
|
| 326 |
phone_numbers = [num for num in combined_phone_regex.findall(text) if len(num) >= 5]
|
| 327 |
-
|
| 328 |
emails = email_regex.findall(text)
|
|
|
|
| 329 |
links_RE = [link for link in link_regex.findall(text) if len(link)>=11]
|
| 330 |
|
| 331 |
# Remove profile links that might conflict with emails
|
|
@@ -385,4 +384,4 @@ def process_resume_data(LLMdata,cont_data,extracted_text):
|
|
| 385 |
processed_data['email'].extend(cont_data.get("emails", []))
|
| 386 |
processed_data['contact_number'].extend(cont_data.get("phone_numbers", []))
|
| 387 |
processed_data['Link'].extend(cont_data.get("links_RE", []))
|
| 388 |
-
return processed_data
|
|
|
|
| 43 |
# Initialize the InferenceClient
|
| 44 |
client = InferenceClient(model="mistralai/Mistral-7B-Instruct-v0.3", token=HFT)
|
| 45 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 46 |
# Load image using OpenCV
|
| 47 |
def load_image(image_path):
|
| 48 |
image = cv2.imread(image_path)
|
|
|
|
| 95 |
|
| 96 |
return final_image
|
| 97 |
|
| 98 |
+
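# Run PaddleOCR on an image and return (final_text, boxes): the recognized text pieces joined with spaces, and the list of their bounding boxes.
# NOTE(review): the loop below unpacks each result line into three values; PaddleOCR 2.x documents each line as [box, (text, score)] - confirm against the installed version.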
|
| 99 |
def ocr_with_paddle(img):
|
| 100 |
+
final_text = ''
|
| 101 |
+
|
|
|
|
|
|
|
| 102 |
logging.info(f"PADDLEOCR_HOME: {os.environ['PADDLEOCR_HOME']}")
|
| 103 |
ocr = PaddleOCR(
|
| 104 |
lang='en',
|
|
|
|
| 109 |
)
|
| 110 |
|
| 111 |
result = ocr.ocr(img)
|
| 112 |
+
boxes = []
|
| 113 |
+
for line in result[0]:
|
| 114 |
+
box, text, _ = line
|
| 115 |
+
boxes.append(box) # Append the bounding box
|
| 116 |
+
final_text += ' ' + text
|
| 117 |
+
|
| 118 |
+
return final_text, boxes
|
| 119 |
|
| 120 |
+
# Function to draw bounding boxes around text
|
| 121 |
+
def draw_boxes(image, boxes):
    """Outline every bounding box on ``image`` in red and return the image.

    Args:
        image: Image handed to ``ImageDraw.Draw``; it is drawn on in place.
        boxes: Iterable of boxes, each a sequence of corner points. ``None``
            or an empty iterable draws nothing.

    Returns:
        The same ``image`` object that was passed in.
    """
    draw = ImageDraw.Draw(image)
    for box in boxes or ():
        # Corners may arrive as [x, y] lists (or array rows). Plain tuples are
        # the documented point form for ImageDraw.polygon, so normalise them;
        # bare numbers (a flat x, y, x, y, ... sequence) pass through untouched.
        corners = [
            tuple(point) if hasattr(point, "__iter__") else point
            for point in box
        ]
        draw.polygon(corners, outline="red", width=3)
    return image
|
| 126 |
|
| 127 |
+
# Extract text and create a result image with bounding boxes
|
| 128 |
def extract_text_from_images(image_paths):
|
| 129 |
all_extracted_texts = {}
|
| 130 |
all_extracted_imgs = {}
|
|
|
|
| 133 |
# Enhance the image before OCR
|
| 134 |
enhanced_image = process_image(image_path, scale=2)
|
| 135 |
|
| 136 |
+
# Perform OCR on the enhanced image and get boxes
|
| 137 |
+
result, boxes = ocr_with_paddle(enhanced_image)
|
| 138 |
+
|
| 139 |
+
# Draw bounding boxes on the processed image
|
| 140 |
img_result = Image.fromarray(enhanced_image)
|
| 141 |
+
img_with_boxes = draw_boxes(img_result, boxes)
|
| 142 |
|
| 143 |
+
# Save the image with boxes
|
| 144 |
result_image_path = os.path.join(RESULT_FOLDER, f'result_{os.path.basename(image_path)}')
|
| 145 |
+
img_with_boxes.save(result_image_path)
|
|
|
|
|
|
|
|
|
|
| 146 |
|
| 147 |
+
# Store the text and image result paths
|
| 148 |
all_extracted_texts[image_path] = result
|
| 149 |
all_extracted_imgs[image_path] = result_image_path
|
| 150 |
except ValueError as ve:
|
|
|
|
| 317 |
# Email regex
|
| 318 |
email_regex = re.compile(r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b')
|
| 319 |
|
| 320 |
+
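# Link regex: optional http(s) scheme, then a mandatory "www." prefix, a single host label and one of the listed domain suffixes; paths are not captured. Updated to avoid conflicts with email domains.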
|
|
|
|
|
|
|
| 321 |
link_regex = re.compile(r'\b(?:https?:\/\/)?(?:www\.)[a-zA-Z0-9-]+\.(?:com|co\.in|co|io|org|net|edu|gov|mil|int|uk|us|in|de|au|app|tech|xyz|info|biz|fr|dev)\b')
|
| 322 |
+
|
| 323 |
# Find all matches in the text
|
| 324 |
phone_numbers = [num for num in combined_phone_regex.findall(text) if len(num) >= 5]
|
| 325 |
+
|
| 326 |
emails = email_regex.findall(text)
|
| 327 |
+
|
| 328 |
links_RE = [link for link in link_regex.findall(text) if len(link)>=11]
|
| 329 |
|
| 330 |
# Remove profile links that might conflict with emails
|
|
|
|
| 384 |
processed_data['email'].extend(cont_data.get("emails", []))
|
| 385 |
processed_data['contact_number'].extend(cont_data.get("phone_numbers", []))
|
| 386 |
processed_data['Link'].extend(cont_data.get("links_RE", []))
|
| 387 |
+
return processed_data
|