Spaces:
Running
Running
fix dependencies
Browse files- grobid_processors.py +19 -4
grobid_processors.py
CHANGED
|
@@ -8,8 +8,6 @@ import grobid_tei_xml
|
|
| 8 |
from bs4 import BeautifulSoup
|
| 9 |
from tqdm import tqdm
|
| 10 |
|
| 11 |
-
from commons import supermat_tei_parser
|
| 12 |
-
|
| 13 |
|
| 14 |
def get_span_start(type, title=None):
|
| 15 |
title_ = ' title="' + title + '"' if title is not None else ""
|
|
@@ -659,7 +657,7 @@ class XmlProcessor(BaseProcessor):
|
|
| 659 |
def parse_xml(self, text):
|
| 660 |
output_data = OrderedDict()
|
| 661 |
soup = BeautifulSoup(text, 'xml')
|
| 662 |
-
text_blocks_children =
|
| 663 |
|
| 664 |
passages = []
|
| 665 |
output_data['passages'] = passages
|
|
@@ -680,8 +678,25 @@ class XmlProcessor(BaseProcessor):
|
|
| 680 |
|
| 681 |
return output_data
|
| 682 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 683 |
|
| 684 |
-
def
|
| 685 |
children = []
|
| 686 |
|
| 687 |
child_name = "p" if use_paragraphs else "s"
|
|
|
|
| 8 |
from bs4 import BeautifulSoup
|
| 9 |
from tqdm import tqdm
|
| 10 |
|
|
|
|
|
|
|
| 11 |
|
| 12 |
def get_span_start(type, title=None):
|
| 13 |
title_ = ' title="' + title + '"' if title is not None else ""
|
|
|
|
| 657 |
def parse_xml(self, text):
|
| 658 |
output_data = OrderedDict()
|
| 659 |
soup = BeautifulSoup(text, 'xml')
|
| 660 |
+
text_blocks_children = get_children_list_supermat(soup, verbose=False)
|
| 661 |
|
| 662 |
passages = []
|
| 663 |
output_data['passages'] = passages
|
|
|
|
| 678 |
|
| 679 |
return output_data
|
| 680 |
|
| 681 |
+
def get_children_list_supermat(soup, use_paragraphs=False, verbose=False):
    """Collect the text-block elements of a TEI document.

    Walks the direct children of the ``<tei>`` root and gathers, in order:

    * from ``teiHeader``: the result of ``find_all("title")``, then one
      result per ``abstract`` element and one per ``ab`` element whose
      ``type`` attribute is ``keywords``;
    * from ``text``: one result per ``body`` element.

    Apart from the title lookup, each result is the ``find_all`` output for
    the chosen child tag inside that container, so the returned list holds
    one group of elements per container rather than a flat list of elements.

    Args:
        soup: Parsed document exposing a ``tei`` attribute (in the calling
            code this is a ``BeautifulSoup`` object built with the ``xml``
            parser).
        use_paragraphs: When true, collect ``<p>`` elements; otherwise
            collect ``<s>`` elements.
        verbose: When true, print the collected list before returning it.

    Returns:
        A list of ``find_all`` results, one per container found. The list is
        empty when the document has no ``<tei>`` root.
    """
    children = []
    child_name = "p" if use_paragraphs else "s"

    # ``soup.tei`` is None when the document has no <tei> element; treat that
    # as "nothing to collect" instead of failing on ``None.children``.
    root = soup.tei
    top_level_nodes = root.children if root is not None else ()

    for child in top_level_nodes:
        if child.name == 'teiHeader':
            children.append(child.find_all("title"))
            children.extend(
                subchild.find_all(child_name)
                for subchild in child.find_all("abstract")
            )
            children.extend(
                subchild.find_all(child_name)
                for subchild in child.find_all("ab", {"type": "keywords"})
            )
        elif child.name == 'text':
            children.extend(
                subchild.find_all(child_name)
                for subchild in child.find_all("body")
            )

    if verbose:
        print(str(children))

    return children
|
| 698 |
|
| 699 |
+
def get_children_list_grobid(soup: object, use_paragraphs: object = True, verbose: object = False) -> object:
|
| 700 |
children = []
|
| 701 |
|
| 702 |
child_name = "p" if use_paragraphs else "s"
|