Update app.py
Browse files
app.py
CHANGED
|
@@ -1,5 +1,6 @@
|
|
| 1 |
import os

import gradio as gr

from imports import *

# SECURITY FIX: a Hugging Face API token was hard-coded here. A committed
# token is leaked forever in git history and must be revoked. Read it from
# the environment instead; with token=None, login() falls back to the
# cached credentials from `huggingface-cli login`.
login(token=os.environ.get("HF_TOKEN"))
|
| 4 |
|
| 5 |
|
|
@@ -54,16 +55,98 @@ def sentiment(sent: str):
|
|
| 54 |
return "Sentiment: " + pred_sent + "\n" + "Topic in sentence: " + ". ".join([i.capitalize() for i in pred_topic]) # str({"sentiment": pred_sent, "topic": pred_topic})
|
| 55 |
|
| 56 |
|
| 57 |
-
|
| 58 |
-
|
| 59 |
-
|
| 60 |
-
|
| 61 |
-
|
| 62 |
-
|
| 63 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 64 |
|
| 65 |
-
|
| 66 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 67 |
|
| 68 |
|
| 69 |
with gr.Blocks() as demo:
|
|
@@ -73,15 +156,19 @@ with gr.Blocks() as demo:
|
|
| 73 |
text_output = gr.Textbox(label="Result:")
|
| 74 |
text_button = gr.Button("Predict")
|
| 75 |
with gr.Tab("Extract infomation from resume"):
|
| 76 |
-
|
| 77 |
-
|
| 78 |
-
|
| 79 |
-
|
|
|
|
|
|
|
|
|
|
| 80 |
|
| 81 |
# with gr.Accordion("Open for More!"):
|
| 82 |
# gr.Markdown("Look at me...")
|
| 83 |
|
| 84 |
text_button.click(sentiment, inputs=text_input, outputs=text_output)
|
| 85 |
-
|
|
|
|
| 86 |
|
| 87 |
demo.launch()
|
|
|
|
| 1 |
import os

import gradio as gr

from imports import *
from parse_info import *

# SECURITY FIX: the Hugging Face API token was hard-coded here and is now
# leaked in git history — revoke it. Read it from the environment instead;
# with token=None, login() falls back to credentials cached by
# `huggingface-cli login`.
login(token=os.environ.get("HF_TOKEN"))
|
| 5 |
|
| 6 |
|
|
|
|
| 55 |
return "Sentiment: " + pred_sent + "\n" + "Topic in sentence: " + ". ".join([i.capitalize() for i in pred_topic]) # str({"sentiment": pred_sent, "topic": pred_topic})
|
| 56 |
|
| 57 |
|
| 58 |
+
# Resume-parsing model: LayoutLMv3 token classification over words +
# normalized bounding boxes + page image (OCR is done separately, hence
# apply_ocr=False).
processor = transformers.AutoProcessor.from_pretrained(
    "nam194/resume_parsing_layoutlmv3_large_custom_label",
    use_auth_token=True, apply_ocr=False)
model = transformers.LayoutLMv3ForTokenClassification.from_pretrained(
    "nam194/resume_parsing_layoutlmv3_large_custom_label").to(device)

# Entity labels predicted by the model, in id order. id2label/label2id are
# now DERIVED from this single list instead of being a second hand-written
# dict, so the structures can never drift apart.
label_list = ['person_name', 'dob_key', 'dob_value', 'gender_key', 'gender_value',
              'phonenumber_key', 'phonenumber_value', 'email_key', 'email_value',
              'address_key', 'address_value', 'socical_address_value', 'education',
              'education_name', 'education_time', 'experience', 'experience_name',
              'experience_time', 'information', 'undefined', 'designation_key',
              'designation_value', 'degree_key', 'degree_value', 'skill_key',
              'skill_value']
id2label = dict(enumerate(label_list))
label2id = {v: k for k, v in id2label.items()}

# Subset of labels whose word spans are collected into the extraction result.
key_list = ["person_name", "dob_value", "gender_value", "phonenumber_value",
            "email_value", "address_value", "socical_address_value",
            "education_name", "education_time", "experience_name",
            "experience_time", "designation_value", "degree_value",
            "skill_value"]
|
| 71 |
+
def pred_resume(pdf_path) -> str:
    """Extract labelled text spans from an uploaded resume PDF.

    Parameters
    ----------
    pdf_path : gradio file object (``gr.File`` upload; ``.name`` is the
        temp-file path on disk).

    Returns
    -------
    str
        ``str()`` of a dict mapping every key in ``key_list`` to the list of
        words predicted with that label. ``norm`` parses it back with
        ``ast.literal_eval``.
    """
    result = {key: [] for key in key_list}
    # PyMuPDF span coordinates are in PDF points; the rendered page images
    # use a different resolution, so bboxes are scaled to the image space.
    # NOTE(review): 200/77 looks like an empirical scale factor — confirm.
    SCALE = 200 / 77

    # Read the pdf and render every page to an image.
    doc = fitz.open(pdf_path.name)
    # BUG FIX: convert_from_path() needs a filesystem path. The gradio File
    # object itself was passed before, while fitz already used .name.
    images = pdf2image.convert_from_path(pdf_path.name)

    # Collect the raw text blocks of every page (1-based page numbers).
    block_dict = {page_num: page.get_text('dict')['blocks']
                  for page_num, page in enumerate(doc, start=1)}

    # Predict page by page.
    for page_num, blocks in block_dict.items():
        bboxes, words = [], []  # bounding boxes and text of this page
        image = images[page_num - 1]
        for block in blocks:
            if block['type'] != 0:  # type 0 == text block
                continue
            for line in block['lines']:
                for span in line['spans']:
                    xmin, ymin, xmax, ymax = [int(i) * SCALE for i in span['bbox']]
                    text = unidecode(span['text']).strip()
                    if text.replace(" ", "") != "":
                        bboxes.append(normalize_bbox([xmin, ymin, xmax, ymax], image.size))
                        words.append(decontracted(text))

        # Dummy word labels: we only need the -100 masking the processor
        # emits for padding / non-first subword tokens, so that predictions
        # can be re-aligned one-per-word below.
        fake_label = ["O"] * len(words)
        encoding = processor(image, words, boxes=bboxes, word_labels=fake_label,
                             truncation=True, stride=256, padding="max_length",
                             max_length=512, return_overflowing_tokens=True,
                             return_offsets_mapping=True)
        labels = encoding["labels"]
        encoding.pop('offset_mapping')
        encoding.pop('overflow_to_sample_mapping')
        encoding = {k: torch.tensor(v) for k, v in encoding.items() if k != "labels"}
        # pixel_values comes back as a list of per-window arrays; batch them.
        encoding['pixel_values'] = torch.stack(list(encoding['pixel_values']))

        # Forward pass (inference only — no gradients).
        with torch.no_grad():
            outputs = model(**{k: v.to(device) for k, v in encoding.items()})

        predictions = outputs["logits"].argmax(-1).squeeze().tolist()
        if outputs["logits"].shape[0] > 1:
            # Merge overlapping windows: every window after the first repeats
            # the previous window's last `stride` (256) tokens — drop them,
            # then flatten windows into one token sequence.
            for i in range(1, len(labels)):
                labels[i] = labels[i][256:]
                predictions[i] = predictions[i][256:]
            predictions = [j for i in predictions for j in i]
            labels = [j for i in labels for j in i]
        else:
            # BUG FIX (single-window case): squeeze() already flattened
            # `predictions`, but `labels` was still a list of one list, so
            # zip() below misaligned. Unwrap it.
            labels = labels[0]

        # Keep one prediction per word (-100 marks padding and non-first
        # subword tokens) and collect the words of interesting labels.
        true_predictions = [id2label[pred] for pred, label in zip(predictions, labels)
                            if label != -100]
        for i, pred in enumerate(true_predictions):
            if pred in key_list:
                result[pred].append(words[i])
    return str(result)
|
| 137 |
+
def norm(result: str) -> str:
    """Normalise the raw extraction produced by ``pred_resume``.

    The argument is the ``str()`` of a dict; it is parsed back with
    ``ast.literal_eval``, each field is cleaned by its rule-based parser
    from ``parse_info``, and the normalised dict is returned as a string.
    """
    fields = ast.literal_eval(result)
    fields["person_name"] = " ".join(
        parse_string(part).capitalize()
        for part in " ".join(fields["person_name"]).split())
    fields["email_value"] = parse_email(fields["email_value"])
    fields["phonenumber_value"] = "".join(
        ch for ch in "".join(fields["phonenumber_value"]) if ch.isdigit())
    fields["address_value"] = parse_address(fields["address_value"])
    fields["designation_value"] = parse_designation(fields["designation_value"])
    fields["experience_time"] = parse_time(fields["experience_time"])
    fields["gender_value"] = parse_gender(fields["gender_value"])
    fields["skill_value"] = parse_skill(fields["skill_value"])
    fields["education_name"] = parse_designation(fields["education_name"])
    fields["experience_name"] = parse_designation(fields["experience_name"])
    return str(fields)
|
| 150 |
|
| 151 |
|
| 152 |
with gr.Blocks() as demo:
|
|
|
|
| 156 |
text_output = gr.Textbox(label="Result:")
|
| 157 |
text_button = gr.Button("Predict")
|
| 158 |
with gr.Tab("Extract infomation from resume"):
|
| 159 |
+
with gr.Row():
|
| 160 |
+
file_input = gr.File(label="Upload pdf", file_types=[".pdf"])
|
| 161 |
+
cv_output = gr.Textbox(label="Information fields")
|
| 162 |
+
resume_button = gr.Button("Extract")
|
| 163 |
+
with gr.Row():
|
| 164 |
+
normalize_output = gr.Textbox(label="Normalize by rule-based:")
|
| 165 |
+
normalize_button = gr.Button("Normailze")
|
| 166 |
|
| 167 |
# with gr.Accordion("Open for More!"):
|
| 168 |
# gr.Markdown("Look at me...")
|
| 169 |
|
| 170 |
text_button.click(sentiment, inputs=text_input, outputs=text_output)
|
| 171 |
+
resume_button.click(pred_resume, inputs=file_input, outputs=cv_output)
|
| 172 |
+
normalize_button.click(norm, inputs=cv_output, outputs=normalize_output)
|
| 173 |
|
| 174 |
demo.launch()
|