Spaces:
Running
Running
Luca Foppiano
committed on
refactor grobid processors (#29)
Browse files
refactor grobid processors, deprecate legacy methods
document_qa/document_qa_engine.py
CHANGED
|
@@ -269,7 +269,7 @@ class DocumentQAEngine:
|
|
| 269 |
print("File", pdf_file_path)
|
| 270 |
filename = Path(pdf_file_path).stem
|
| 271 |
coordinates = True # if chunk_size == -1 else False
|
| 272 |
-
structure = self.grobid_processor.
|
| 273 |
|
| 274 |
biblio = structure['biblio']
|
| 275 |
biblio['filename'] = filename.replace(" ", "_")
|
|
|
|
| 269 |
print("File", pdf_file_path)
|
| 270 |
filename = Path(pdf_file_path).stem
|
| 271 |
coordinates = True # if chunk_size == -1 else False
|
| 272 |
+
structure = self.grobid_processor.process(pdf_file_path, coordinates=coordinates)
|
| 273 |
|
| 274 |
biblio = structure['biblio']
|
| 275 |
biblio['filename'] = filename.replace(" ", "_")
|
document_qa/grobid_processors.py
CHANGED
|
@@ -2,6 +2,7 @@ import re
|
|
| 2 |
from collections import OrderedDict
|
| 3 |
from html import escape
|
| 4 |
from pathlib import Path
|
|
|
|
| 5 |
|
| 6 |
import dateparser
|
| 7 |
import grobid_tei_xml
|
|
@@ -54,6 +55,7 @@ def decorate_text_with_annotations(text, spans, tag="span"):
|
|
| 54 |
return annotated_text
|
| 55 |
|
| 56 |
|
|
|
|
| 57 |
def extract_quantities(client, x_all, column_text_index):
|
| 58 |
# relevant_items = ['magnetic field strength', 'magnetic induction', 'maximum energy product',
|
| 59 |
# "magnetic flux density", "magnetic flux"]
|
|
@@ -63,7 +65,7 @@ def extract_quantities(client, x_all, column_text_index):
|
|
| 63 |
|
| 64 |
for idx, example in tqdm(enumerate(x_all), desc="extract quantities"):
|
| 65 |
text = example[column_text_index]
|
| 66 |
-
spans = GrobidQuantitiesProcessor(client).
|
| 67 |
|
| 68 |
data_record = {
|
| 69 |
"id": example[0],
|
|
@@ -78,12 +80,13 @@ def extract_quantities(client, x_all, column_text_index):
|
|
| 78 |
return output_data
|
| 79 |
|
| 80 |
|
|
|
|
| 81 |
def extract_materials(client, x_all, column_text_index):
|
| 82 |
output_data = []
|
| 83 |
|
| 84 |
for idx, example in tqdm(enumerate(x_all), desc="extract materials"):
|
| 85 |
text = example[column_text_index]
|
| 86 |
-
spans = GrobidMaterialsProcessor(client).
|
| 87 |
data_record = {
|
| 88 |
"id": example[0],
|
| 89 |
"filename": example[1],
|
|
@@ -131,7 +134,7 @@ class GrobidProcessor(BaseProcessor):
|
|
| 131 |
# super().__init__()
|
| 132 |
self.grobid_client = grobid_client
|
| 133 |
|
| 134 |
-
def
|
| 135 |
pdf_file, status, text = self.grobid_client.process_pdf("processFulltextDocument",
|
| 136 |
input_path,
|
| 137 |
consolidate_header=True,
|
|
@@ -145,19 +148,10 @@ class GrobidProcessor(BaseProcessor):
|
|
| 145 |
if status != 200:
|
| 146 |
return
|
| 147 |
|
| 148 |
-
|
| 149 |
-
|
| 150 |
|
| 151 |
-
return
|
| 152 |
-
|
| 153 |
-
def process_single(self, input_file):
|
| 154 |
-
doc = self.process_structure(input_file)
|
| 155 |
-
|
| 156 |
-
for paragraph in doc['passages']:
|
| 157 |
-
entities = self.process_single_text(paragraph['text'])
|
| 158 |
-
paragraph['spans'] = entities
|
| 159 |
-
|
| 160 |
-
return doc
|
| 161 |
|
| 162 |
def parse_grobid_xml(self, text, coordinates=False):
|
| 163 |
output_data = OrderedDict()
|
|
@@ -187,10 +181,10 @@ class GrobidProcessor(BaseProcessor):
|
|
| 187 |
"text": f"authors: {biblio['authors']}",
|
| 188 |
"type": passage_type,
|
| 189 |
"section": "<header>",
|
| 190 |
-
"subSection": "<
|
| 191 |
-
"passage_id": "
|
| 192 |
"coordinates": ";".join([node['coords'] if coordinates and node.has_attr('coords') else "" for node in
|
| 193 |
-
|
| 194 |
})
|
| 195 |
|
| 196 |
passages.append({
|
|
@@ -293,7 +287,7 @@ class GrobidQuantitiesProcessor(BaseProcessor):
|
|
| 293 |
def __init__(self, grobid_quantities_client):
|
| 294 |
self.grobid_quantities_client = grobid_quantities_client
|
| 295 |
|
| 296 |
-
def
|
| 297 |
status, result = self.grobid_quantities_client.process_text(text.strip())
|
| 298 |
|
| 299 |
if status != 200:
|
|
@@ -465,7 +459,7 @@ class GrobidMaterialsProcessor(BaseProcessor):
|
|
| 465 |
def __init__(self, grobid_superconductors_client):
|
| 466 |
self.grobid_superconductors_client = grobid_superconductors_client
|
| 467 |
|
| 468 |
-
def
|
| 469 |
preprocessed_text = text.strip()
|
| 470 |
status, result = self.grobid_superconductors_client.process_text(preprocessed_text,
|
| 471 |
"processText_disable_linking")
|
|
@@ -568,17 +562,17 @@ class GrobidAggregationProcessor(GrobidProcessor, GrobidQuantitiesProcessor, Gro
|
|
| 568 |
self.gmp = GrobidMaterialsProcessor(grobid_superconductors_client)
|
| 569 |
|
| 570 |
def process_single_text(self, text):
|
| 571 |
-
extracted_quantities_spans = self.
|
| 572 |
-
extracted_materials_spans = self.
|
| 573 |
all_entities = extracted_quantities_spans + extracted_materials_spans
|
| 574 |
entities = self.prune_overlapping_annotations(all_entities)
|
| 575 |
return entities
|
| 576 |
|
| 577 |
-
def
|
| 578 |
-
return self.gqp.
|
| 579 |
|
| 580 |
-
def
|
| 581 |
-
return self.gmp.
|
| 582 |
|
| 583 |
@staticmethod
|
| 584 |
def box_to_dict(box, color=None, type=None):
|
|
@@ -715,8 +709,8 @@ class GrobidAggregationProcessor(GrobidProcessor, GrobidQuantitiesProcessor, Gro
|
|
| 715 |
|
| 716 |
|
| 717 |
class XmlProcessor(BaseProcessor):
|
| 718 |
-
def __init__(self
|
| 719 |
-
super().__init__(
|
| 720 |
|
| 721 |
def process_structure(self, input_file):
|
| 722 |
text = ""
|
|
@@ -728,16 +722,16 @@ class XmlProcessor(BaseProcessor):
|
|
| 728 |
|
| 729 |
return output_data
|
| 730 |
|
| 731 |
-
def process_single(self, input_file):
|
| 732 |
-
|
| 733 |
-
|
| 734 |
-
|
| 735 |
-
|
| 736 |
-
|
| 737 |
-
|
| 738 |
-
|
| 739 |
|
| 740 |
-
def
|
| 741 |
output_data = OrderedDict()
|
| 742 |
soup = BeautifulSoup(text, 'xml')
|
| 743 |
text_blocks_children = get_children_list_supermat(soup, verbose=False)
|
|
|
|
| 2 |
from collections import OrderedDict
|
| 3 |
from html import escape
|
| 4 |
from pathlib import Path
|
| 5 |
+
from typing_extensions import deprecated
|
| 6 |
|
| 7 |
import dateparser
|
| 8 |
import grobid_tei_xml
|
|
|
|
| 55 |
return annotated_text
|
| 56 |
|
| 57 |
|
| 58 |
+
@deprecated("Use GrobidQuantitiesProcessor.process() instead")
|
| 59 |
def extract_quantities(client, x_all, column_text_index):
|
| 60 |
# relevant_items = ['magnetic field strength', 'magnetic induction', 'maximum energy product',
|
| 61 |
# "magnetic flux density", "magnetic flux"]
|
|
|
|
| 65 |
|
| 66 |
for idx, example in tqdm(enumerate(x_all), desc="extract quantities"):
|
| 67 |
text = example[column_text_index]
|
| 68 |
+
spans = GrobidQuantitiesProcessor(client).process(text)
|
| 69 |
|
| 70 |
data_record = {
|
| 71 |
"id": example[0],
|
|
|
|
| 80 |
return output_data
|
| 81 |
|
| 82 |
|
| 83 |
+
@deprecated("Use GrobidMaterialsProcessor.process() instead")
|
| 84 |
def extract_materials(client, x_all, column_text_index):
|
| 85 |
output_data = []
|
| 86 |
|
| 87 |
for idx, example in tqdm(enumerate(x_all), desc="extract materials"):
|
| 88 |
text = example[column_text_index]
|
| 89 |
+
spans = GrobidMaterialsProcessor(client).process(text)
|
| 90 |
data_record = {
|
| 91 |
"id": example[0],
|
| 92 |
"filename": example[1],
|
|
|
|
| 134 |
# super().__init__()
|
| 135 |
self.grobid_client = grobid_client
|
| 136 |
|
| 137 |
+
def process(self, input_path, coordinates=False):
|
| 138 |
pdf_file, status, text = self.grobid_client.process_pdf("processFulltextDocument",
|
| 139 |
input_path,
|
| 140 |
consolidate_header=True,
|
|
|
|
| 148 |
if status != 200:
|
| 149 |
return
|
| 150 |
|
| 151 |
+
document_object = self.parse_grobid_xml(text, coordinates=coordinates)
|
| 152 |
+
document_object['filename'] = Path(pdf_file).stem.replace(".tei", "")
|
| 153 |
|
| 154 |
+
return document_object
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 155 |
|
| 156 |
def parse_grobid_xml(self, text, coordinates=False):
|
| 157 |
output_data = OrderedDict()
|
|
|
|
| 181 |
"text": f"authors: {biblio['authors']}",
|
| 182 |
"type": passage_type,
|
| 183 |
"section": "<header>",
|
| 184 |
+
"subSection": "<authors>",
|
| 185 |
+
"passage_id": "hauthors",
|
| 186 |
"coordinates": ";".join([node['coords'] if coordinates and node.has_attr('coords') else "" for node in
|
| 187 |
+
blocks_header['authors']])
|
| 188 |
})
|
| 189 |
|
| 190 |
passages.append({
|
|
|
|
| 287 |
def __init__(self, grobid_quantities_client):
|
| 288 |
self.grobid_quantities_client = grobid_quantities_client
|
| 289 |
|
| 290 |
+
def process(self, text):
|
| 291 |
status, result = self.grobid_quantities_client.process_text(text.strip())
|
| 292 |
|
| 293 |
if status != 200:
|
|
|
|
| 459 |
def __init__(self, grobid_superconductors_client):
|
| 460 |
self.grobid_superconductors_client = grobid_superconductors_client
|
| 461 |
|
| 462 |
+
def process(self, text):
|
| 463 |
preprocessed_text = text.strip()
|
| 464 |
status, result = self.grobid_superconductors_client.process_text(preprocessed_text,
|
| 465 |
"processText_disable_linking")
|
|
|
|
| 562 |
self.gmp = GrobidMaterialsProcessor(grobid_superconductors_client)
|
| 563 |
|
| 564 |
def process_single_text(self, text):
|
| 565 |
+
extracted_quantities_spans = self.process_properties(text)
|
| 566 |
+
extracted_materials_spans = self.process_materials(text)
|
| 567 |
all_entities = extracted_quantities_spans + extracted_materials_spans
|
| 568 |
entities = self.prune_overlapping_annotations(all_entities)
|
| 569 |
return entities
|
| 570 |
|
| 571 |
+
def process_properties(self, text):
|
| 572 |
+
return self.gqp.process(text)
|
| 573 |
|
| 574 |
+
def process_materials(self, text):
|
| 575 |
+
return self.gmp.process(text)
|
| 576 |
|
| 577 |
@staticmethod
|
| 578 |
def box_to_dict(box, color=None, type=None):
|
|
|
|
| 709 |
|
| 710 |
|
| 711 |
class XmlProcessor(BaseProcessor):
|
| 712 |
+
def __init__(self):
|
| 713 |
+
super().__init__()
|
| 714 |
|
| 715 |
def process_structure(self, input_file):
|
| 716 |
text = ""
|
|
|
|
| 722 |
|
| 723 |
return output_data
|
| 724 |
|
| 725 |
+
# def process_single(self, input_file):
|
| 726 |
+
# doc = self.process_structure(input_file)
|
| 727 |
+
#
|
| 728 |
+
# for paragraph in doc['passages']:
|
| 729 |
+
# entities = self.process_single_text(paragraph['text'])
|
| 730 |
+
# paragraph['spans'] = entities
|
| 731 |
+
#
|
| 732 |
+
# return doc
|
| 733 |
|
| 734 |
+
def process(self, text):
|
| 735 |
output_data = OrderedDict()
|
| 736 |
soup = BeautifulSoup(text, 'xml')
|
| 737 |
text_blocks_children = get_children_list_supermat(soup, verbose=False)
|