from transformers import pipeline, AutoTokenizer
import bz2
import json
from pprint import pprint

MODEL_NAME = "impresso-project/ner-stacked-bert-multilingual-light"

# Load the tokenizer and the model's custom NER pipeline. trust_remote_code=True
# is required because the "generic-ner" pipeline class is defined in the model
# repository rather than in the transformers library.
ner_tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

ner_pipeline = pipeline(
    "generic-ner",
    model=MODEL_NAME,
    tokenizer=ner_tokenizer,
    trust_remote_code=True,
    device="cpu",
)

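# Optional sanity check: run the pipeline on a single French sentence before
# processing whole archives. This is an illustrative sketch; the sentence is
# arbitrary, and the shape of the returned entity dicts (keys such as
# "surface", "type", "lOffset", "rOffset") is assumed from how they are
# consumed further below.
sample_entities = ner_pipeline("Le journal Le Matin est publié à Paris.")
pprint(sample_entities)
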
def process_archive(lingproc_path):
    """
    Processes a linguistic-processing (lingproc) archive to reconstruct each
    document's full text and collect sentence and token offsets.

    Args:
        lingproc_path (str): Path to the lingproc .jsonl.bz2 archive.

    Returns:
        List of tuples: (doc_id, full_text, sentences), where sentences is a
        list of dicts with "start", "end" and "tokens" keys.
    """
    results = []

    with bz2.open(lingproc_path, mode='rt', encoding='utf-8') as f:
        for line in f:
            data = json.loads(line)
            doc_id = data.get("id")

            # Map each token's character offset to its surface form.
            offset_token_map = {}
            for sent in data.get("sents", []):
                for token in sent.get("tok", []):
                    offset = token["o"]
                    text = token["t"]
                    offset_token_map[offset] = text

            # Rebuild the full text by placing every token at its offset and
            # padding the gaps between tokens with spaces.
            full_text_parts = []
            sorted_offsets = sorted(offset_token_map.keys())
            last_end = 0
            for offset in sorted_offsets:
                token = offset_token_map[offset]
                if offset > last_end:
                    full_text_parts.append(" " * (offset - last_end))
                full_text_parts.append(token)
                last_end = offset + len(token)
            # Only trim trailing whitespace: stripping leading whitespace would
            # shift the text relative to the token offsets.
            full_text = "".join(full_text_parts).rstrip()

            # Record sentence boundaries plus per-token offsets and lengths.
            sentences = []
            for sent in data.get("sents", []):
                tokens = sent.get("tok", [])
                if not tokens:
                    continue
                start = tokens[0]["o"]
                end = tokens[-1]["o"] + len(tokens[-1]["t"])
                newtokens = [
                    {"t": token["t"], "o": token["o"], "l": len(token["t"])}
                    for token in tokens
                ]
                sentences.append({"start": start, "end": end, "tokens": newtokens})

            results.append((doc_id, full_text, sentences))

    return results

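# Illustrative shape of one lingproc record, inferred from the fields the
# function above reads ("id", "sents", "tok", "t", "o"). Real records may carry
# additional fields, which are simply ignored here; the id value below is
# purely hypothetical.
_example_record = {
    "id": "lematin-1885-01-01-a-i0001",
    "sents": [
        {"tok": [{"t": "Paris", "o": 0}, {"t": ",", "o": 5}, {"t": "le", "o": 7}]},
    ],
}
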
# Process the archive and run NER sentence by sentence, mapping the entity
# offsets back to the reconstructed full text.
processed_cis = process_archive("../../data/lematin-1885.jsonl.bz2")

for ci in processed_cis:
    doc_id, full_text, offsets = ci
    print(f"Document ID: {doc_id}")

    for sentence in offsets:
        start = sentence["start"]
        end = sentence["end"]
        tokens = sentence["tokens"]
        sentence_text = full_text[start:end]
        tokens_texts = [full_text[token["o"]:token["o"] + len(token["t"])] for token in tokens]

        # Run the NER pipeline on the sentence, passing the pre-tokenized
        # surface forms alongside the raw sentence text.
        entities = ner_pipeline(sentence_text, tokens=tokens_texts)

        for entity in entities:
            # lOffset/rOffset are relative to the sentence; shift them by the
            # sentence start to obtain absolute offsets in the full text.
            abs_start = sentence["start"] + entity["lOffset"]
            abs_end = sentence["start"] + entity["rOffset"]
            entity_text = full_text[abs_start:abs_end]
            entity_surface = entity["surface"]
            assert entity_text == entity_surface, f"Entity text mismatch: {entity_text} != {entity_surface}"
            print(f"{doc_id}: {entity_text} -- surface: {entity_surface} -- {entity['type']} -- {abs_start} - {abs_end}")

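# Sketch of an alternative to the print loop above: gather the recognised
# entities per document into plain dicts that could be serialised afterwards
# (e.g. with json.dump). The helper name and output layout are illustrative,
# not part of the original workflow.
def collect_entities(processed_items):
    annotated = []
    for doc_id, full_text, sentences in processed_items:
        doc_entities = []
        for sentence in sentences:
            sentence_text = full_text[sentence["start"]:sentence["end"]]
            token_texts = [full_text[t["o"]:t["o"] + t["l"]] for t in sentence["tokens"]]
            for entity in ner_pipeline(sentence_text, tokens=token_texts):
                doc_entities.append({
                    "surface": entity["surface"],
                    "type": entity["type"],
                    "start": sentence["start"] + entity["lOffset"],
                    "end": sentence["start"] + entity["rOffset"],
                })
        annotated.append({"id": doc_id, "entities": doc_entities})
    return annotated

# Example: annotated = collect_entities(processed_cis)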