Spaces:
Runtime error
Runtime error
Update utility/utils.py
Browse files- utility/utils.py +16 -6
utility/utils.py
CHANGED
|
@@ -5,6 +5,7 @@ from dotenv import load_dotenv
|
|
| 5 |
import json
|
| 6 |
import re
|
| 7 |
import easyocr
|
|
|
|
| 8 |
from PIL import Image, ImageEnhance, ImageDraw
|
| 9 |
import cv2
|
| 10 |
import numpy as np
|
|
@@ -22,6 +23,9 @@ client = InferenceClient(model="mistralai/Mistral-7B-Instruct-v0.3", token=HFT)
|
|
| 22 |
# Initialize EasyOCR reader for extracting text
|
| 23 |
reader = easyocr.Reader(['en'])
|
| 24 |
|
|
|
|
|
|
|
|
|
|
| 25 |
def draw_boxes(image, bounds, color='red', width=2):
|
| 26 |
draw = ImageDraw.Draw(image)
|
| 27 |
for bound in bounds:
|
|
@@ -95,11 +99,12 @@ def ocr_with_paddle(img):
|
|
| 95 |
return finaltext
|
| 96 |
|
| 97 |
|
| 98 |
-
def extract_text_from_images(image_paths):
|
| 99 |
all_extracted_texts = {}
|
| 100 |
all_extracted_imgs={}
|
| 101 |
for image_path in image_paths:
|
| 102 |
# Enhance the image before OCR
|
|
|
|
| 103 |
enhanced_image = process_image(image_path, scale=2)
|
| 104 |
bounds = reader.readtext(enhanced_image)
|
| 105 |
# Draw boxes on the processed image
|
|
@@ -111,15 +116,18 @@ def extract_text_from_images(image_paths):
|
|
| 111 |
|
| 112 |
# Perform OCR on the enhanced image
|
| 113 |
result=ocr_with_paddle(enhanced_image)
|
| 114 |
-
|
|
|
|
|
|
|
| 115 |
all_extracted_texts[image_path] =result
|
| 116 |
all_extracted_imgs[image_path] = result_image_path
|
| 117 |
# Convert to JSON-compatible structure
|
| 118 |
all_extracted_imgs_json = {str(k): str(v) for k, v in all_extracted_imgs.items()}
|
| 119 |
return all_extracted_texts,all_extracted_imgs_json
|
| 120 |
|
|
|
|
| 121 |
# Function to call the text-generation model through `client` and process the output as JSON
|
| 122 |
-
def Data_Extractor(data, client):
|
| 123 |
text = f'''Act as a Text extractor for the following text given in text: {data}
|
| 124 |
extract text in the following output JSON string:
|
| 125 |
{{
|
|
@@ -134,7 +142,7 @@ def Data_Extractor(data, client):
|
|
| 134 |
Output:
|
| 135 |
'''
|
| 136 |
# Call the API for inference
|
| 137 |
-
response = client.text_generation(text, max_new_tokens=600)
|
| 138 |
|
| 139 |
print("parse in text ---:",response)
|
| 140 |
|
|
@@ -280,9 +288,10 @@ def extract_contact_details(text):
|
|
| 280 |
# Email regex
|
| 281 |
email_regex = re.compile(r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b')
|
| 282 |
|
| 283 |
-
#
|
|
|
|
|
|
|
| 284 |
link_regex = re.compile(r'\b(?:https?:\/\/)?(?:www\.)[a-zA-Z0-9-]+\.(?:com|co\.in|co|io|org|net|edu|gov|mil|int|uk|us|in|de|au|app|tech|xyz|info|biz|fr|dev)\b')
|
| 285 |
-
|
| 286 |
# Find all matches in the text
|
| 287 |
phone_numbers = [num for num in combined_phone_regex.findall(text) if len(num) >= 5]
|
| 288 |
print("phone_numbers--->",phone_numbers)
|
|
@@ -320,6 +329,7 @@ def process_extracted_text(extracted_text):
|
|
| 320 |
combined_results["links_RE"].extend(contact_details["links_RE"])
|
| 321 |
|
| 322 |
# Keep the combined results as-is; they are not serialized to a JSON string here
|
|
|
|
| 323 |
combined_results_json = combined_results
|
| 324 |
|
| 325 |
# Print the final JSON results
|
|
|
|
| 5 |
import json
|
| 6 |
import re
|
| 7 |
import easyocr
|
| 8 |
+
import spacy
|
| 9 |
from PIL import Image, ImageEnhance, ImageDraw
|
| 10 |
import cv2
|
| 11 |
import numpy as np
|
|
|
|
| 23 |
# Initialize EasyOCR reader for extracting text
|
| 24 |
reader = easyocr.Reader(['en'])
|
| 25 |
|
| 26 |
+
# Initialize spaCy's English model
|
| 27 |
+
nlp = spacy.load("en_core_web_sm")
|
| 28 |
+
|
| 29 |
def draw_boxes(image, bounds, color='red', width=2):
|
| 30 |
draw = ImageDraw.Draw(image)
|
| 31 |
for bound in bounds:
|
|
|
|
| 99 |
return finaltext
|
| 100 |
|
| 101 |
|
| 102 |
+
def extract_text_from_images(image_paths, RESULT_FOLDER):
|
| 103 |
all_extracted_texts = {}
|
| 104 |
all_extracted_imgs={}
|
| 105 |
for image_path in image_paths:
|
| 106 |
# Enhance the image before OCR
|
| 107 |
+
#enhanced_image = load_image(image_path)
|
| 108 |
enhanced_image = process_image(image_path, scale=2)
|
| 109 |
bounds = reader.readtext(enhanced_image)
|
| 110 |
# Draw boxes on the processed image
|
|
|
|
| 116 |
|
| 117 |
# Perform OCR on the enhanced image
|
| 118 |
result=ocr_with_paddle(enhanced_image)
|
| 119 |
+
# results = reader.readtext(enhanced_image)
|
| 120 |
+
# extracted_text = " ".join([res[1] for res in results])
|
| 121 |
+
|
| 122 |
all_extracted_texts[image_path] =result
|
| 123 |
all_extracted_imgs[image_path] = result_image_path
|
| 124 |
# Convert to JSON-compatible structure
|
| 125 |
all_extracted_imgs_json = {str(k): str(v) for k, v in all_extracted_imgs.items()}
|
| 126 |
return all_extracted_texts,all_extracted_imgs_json
|
| 127 |
|
| 128 |
+
|
| 129 |
# Function to call the text-generation model (the module-level Mistral-7B-Instruct client by default) and process the output as JSON
|
| 130 |
+
def Data_Extractor(data, client=client):
|
| 131 |
text = f'''Act as a Text extractor for the following text given in text: {data}
|
| 132 |
extract text in the following output JSON string:
|
| 133 |
{{
|
|
|
|
| 142 |
Output:
|
| 143 |
'''
|
| 144 |
# Call the API for inference
|
| 145 |
+
response = client.text_generation(text, max_new_tokens=600)#, temperature=0.4, top_k=50, top_p=0.9, repetition_penalty=1.2)
|
| 146 |
|
| 147 |
print("parse in text ---:",response)
|
| 148 |
|
|
|
|
| 288 |
# Email regex
|
| 289 |
email_regex = re.compile(r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b')
|
| 290 |
|
| 291 |
+
# Profile links regex, updated to avoid conflicts with email domains
|
| 292 |
+
#link_regex = re.compile(r'\b(?:https?://)?(?:www\.)?(?:linkedin\.com|github\.com|indeed\.com|[A-Za-z0-9-]+\.[A-Za-z]{2,})[\w./?-]*\b')
|
| 293 |
+
#link_regex = re.compile(r'\b(?:https?://)?(?:www\.)?[a-zA-Z0-9-]+\.(?:[a-zA-Z]{2,})(?:\.[a-zA-Z]{2,})?(?:\.[a-zA-Z]{2,})?(?:[/\w.-]*)*[\w/?&=-]*\b')
|
| 294 |
link_regex = re.compile(r'\b(?:https?:\/\/)?(?:www\.)[a-zA-Z0-9-]+\.(?:com|co\.in|co|io|org|net|edu|gov|mil|int|uk|us|in|de|au|app|tech|xyz|info|biz|fr|dev)\b')
|
|
|
|
| 295 |
# Find all matches in the text
|
| 296 |
phone_numbers = [num for num in combined_phone_regex.findall(text) if len(num) >= 5]
|
| 297 |
print("phone_numbers--->",phone_numbers)
|
|
|
|
| 329 |
combined_results["links_RE"].extend(contact_details["links_RE"])
|
| 330 |
|
| 331 |
# Keep the combined results as-is; they are not serialized to a JSON string here
|
| 332 |
+
#combined_results_json = json.dumps(combined_results, indent=4)
|
| 333 |
combined_results_json = combined_results
|
| 334 |
|
| 335 |
# Print the final JSON results
|