Spaces:
Running
Running
decouple quantities and superconductors
Browse files- document_qa/grobid_processors.py +17 -53
document_qa/grobid_processors.py
CHANGED
|
@@ -7,7 +7,6 @@ import dateparser
|
|
| 7 |
import grobid_tei_xml
|
| 8 |
from bs4 import BeautifulSoup
|
| 9 |
from grobid_client.grobid_client import GrobidClient
|
| 10 |
-
from tqdm import tqdm
|
| 11 |
|
| 12 |
|
| 13 |
def get_span_start(type, title=None):
|
|
@@ -55,49 +54,6 @@ def decorate_text_with_annotations(text, spans, tag="span"):
|
|
| 55 |
return annotated_text
|
| 56 |
|
| 57 |
|
| 58 |
-
def extract_quantities(client, x_all, column_text_index):
|
| 59 |
-
# relevant_items = ['magnetic field strength', 'magnetic induction', 'maximum energy product',
|
| 60 |
-
# "magnetic flux density", "magnetic flux"]
|
| 61 |
-
# property_keywords = ['coercivity', 'remanence']
|
| 62 |
-
|
| 63 |
-
output_data = []
|
| 64 |
-
|
| 65 |
-
for idx, example in tqdm(enumerate(x_all), desc="extract quantities"):
|
| 66 |
-
text = example[column_text_index]
|
| 67 |
-
spans = GrobidQuantitiesProcessor(client).extract_quantities(text)
|
| 68 |
-
|
| 69 |
-
data_record = {
|
| 70 |
-
"id": example[0],
|
| 71 |
-
"filename": example[1],
|
| 72 |
-
"passage_id": example[2],
|
| 73 |
-
"text": text,
|
| 74 |
-
"spans": spans
|
| 75 |
-
}
|
| 76 |
-
|
| 77 |
-
output_data.append(data_record)
|
| 78 |
-
|
| 79 |
-
return output_data
|
| 80 |
-
|
| 81 |
-
|
| 82 |
-
def extract_materials(client, x_all, column_text_index):
|
| 83 |
-
output_data = []
|
| 84 |
-
|
| 85 |
-
for idx, example in tqdm(enumerate(x_all), desc="extract materials"):
|
| 86 |
-
text = example[column_text_index]
|
| 87 |
-
spans = GrobidMaterialsProcessor(client).extract_materials(text)
|
| 88 |
-
data_record = {
|
| 89 |
-
"id": example[0],
|
| 90 |
-
"filename": example[1],
|
| 91 |
-
"passage_id": example[2],
|
| 92 |
-
"text": text,
|
| 93 |
-
"spans": spans
|
| 94 |
-
}
|
| 95 |
-
|
| 96 |
-
output_data.append(data_record)
|
| 97 |
-
|
| 98 |
-
return output_data
|
| 99 |
-
|
| 100 |
-
|
| 101 |
def get_parsed_value_type(quantity):
|
| 102 |
if 'parsedValue' in quantity and 'structure' in quantity['parsedValue']:
|
| 103 |
return quantity['parsedValue']['structure']['type']
|
|
@@ -199,7 +155,7 @@ class GrobidProcessor(BaseProcessor):
|
|
| 199 |
"subSection": "<title>",
|
| 200 |
"passage_id": "htitle",
|
| 201 |
"coordinates": ";".join([node['coords'] if coordinates and node.has_attr('coords') else "" for node in
|
| 202 |
-
|
| 203 |
})
|
| 204 |
|
| 205 |
passages.append({
|
|
@@ -302,7 +258,7 @@ class GrobidQuantitiesProcessor(BaseProcessor):
|
|
| 302 |
def __init__(self, grobid_quantities_client):
|
| 303 |
self.grobid_quantities_client = grobid_quantities_client
|
| 304 |
|
| 305 |
-
def extract_quantities(self, text):
|
| 306 |
status, result = self.grobid_quantities_client.process_text(text.strip())
|
| 307 |
|
| 308 |
if status != 200:
|
|
@@ -570,11 +526,12 @@ class GrobidMaterialsProcessor(BaseProcessor):
|
|
| 570 |
return materials
|
| 571 |
|
| 572 |
|
| 573 |
-
class GrobidAggregationProcessor(
|
| 574 |
-
def __init__(self,
|
| 575 |
-
|
| 576 |
-
|
| 577 |
-
|
|
|
|
| 578 |
|
| 579 |
def process_single_text(self, text):
|
| 580 |
extracted_quantities_spans = self.gqp.extract_quantities(text)
|
|
@@ -584,10 +541,17 @@ class GrobidAggregationProcessor(GrobidProcessor, GrobidQuantitiesProcessor, Gro
|
|
| 584 |
return entities
|
| 585 |
|
| 586 |
def extract_quantities(self, text):
|
| 587 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 588 |
|
| 589 |
def extract_materials(self, text):
|
| 590 |
-
|
|
|
|
|
|
|
|
|
|
| 591 |
|
| 592 |
@staticmethod
|
| 593 |
def box_to_dict(box, color=None, type=None):
|
|
|
|
| 7 |
import grobid_tei_xml
|
| 8 |
from bs4 import BeautifulSoup
|
| 9 |
from grobid_client.grobid_client import GrobidClient
|
|
|
|
| 10 |
|
| 11 |
|
| 12 |
def get_span_start(type, title=None):
|
|
|
|
| 54 |
return annotated_text
|
| 55 |
|
| 56 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 57 |
def get_parsed_value_type(quantity):
|
| 58 |
if 'parsedValue' in quantity and 'structure' in quantity['parsedValue']:
|
| 59 |
return quantity['parsedValue']['structure']['type']
|
|
|
|
| 155 |
"subSection": "<title>",
|
| 156 |
"passage_id": "htitle",
|
| 157 |
"coordinates": ";".join([node['coords'] if coordinates and node.has_attr('coords') else "" for node in
|
| 158 |
+
blocks_header['authors']])
|
| 159 |
})
|
| 160 |
|
| 161 |
passages.append({
|
|
|
|
| 258 |
def __init__(self, grobid_quantities_client):
|
| 259 |
self.grobid_quantities_client = grobid_quantities_client
|
| 260 |
|
| 261 |
+
def extract_quantities(self, text) -> list:
|
| 262 |
status, result = self.grobid_quantities_client.process_text(text.strip())
|
| 263 |
|
| 264 |
if status != 200:
|
|
|
|
| 526 |
return materials
|
| 527 |
|
| 528 |
|
| 529 |
+
class GrobidAggregationProcessor(GrobidQuantitiesProcessor, GrobidMaterialsProcessor):
|
| 530 |
+
def __init__(self, grobid_quantities_client=None, grobid_superconductors_client=None):
|
| 531 |
+
if grobid_quantities_client:
|
| 532 |
+
self.gqp = GrobidQuantitiesProcessor(grobid_quantities_client)
|
| 533 |
+
if grobid_superconductors_client:
|
| 534 |
+
self.gmp = GrobidMaterialsProcessor(grobid_superconductors_client)
|
| 535 |
|
| 536 |
def process_single_text(self, text):
|
| 537 |
extracted_quantities_spans = self.gqp.extract_quantities(text)
|
|
|
|
| 541 |
return entities
|
| 542 |
|
| 543 |
def extract_quantities(self, text):
|
| 544 |
+
if self.gqp:
|
| 545 |
+
return self.gqp.extract_quantities(text)
|
| 546 |
+
else:
|
| 547 |
+
return []
|
| 548 |
+
|
| 549 |
|
| 550 |
def extract_materials(self, text):
|
| 551 |
+
if self.gmp:
|
| 552 |
+
return self.gmp.extract_materials(text)
|
| 553 |
+
else:
|
| 554 |
+
return []
|
| 555 |
|
| 556 |
@staticmethod
|
| 557 |
def box_to_dict(box, color=None, type=None):
|