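# Example: run the impresso NER pipeline over a linguistic-processing archive
# (bz2-compressed JSONL, one document per line), rebuild each document's full
# text from token offsets, and tag entities sentence by sentence.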
import bz2
import json

from transformers import AutoTokenizer, pipeline

MODEL_NAME = "impresso-project/ner-stacked-bert-multilingual-light"

# Load the tokenizer explicitly; the model itself is loaded by the pipeline below.
ner_tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
ner_pipeline = pipeline(
    "generic-ner",
    model=MODEL_NAME,
    tokenizer=ner_tokenizer,
    trust_remote_code=True,
    device="cpu",
)
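# Note: trust_remote_code=True is required because the model repository ships
# the custom "generic-ner" pipeline code; device can be changed to e.g.
# "cuda:0" to run on GPU instead.
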
def process_archive(lingproc_path):
    """
    Reads a linguistic-processing .jsonl.bz2 archive and reconstructs, for
    each document, the full text and per-sentence token offsets.

    Args:
        lingproc_path (str): Path to the linguistic-processing .jsonl.bz2 archive.

    Returns:
        List of tuples: (doc_id, full_text, sentences), where sentences is a
        list of dicts with "start"/"end" character offsets and a "tokens" list.
    """
    results = []
    with bz2.open(lingproc_path, mode="rt", encoding="utf-8") as f:
        for line in f:
            data = json.loads(line)
            doc_id = data.get("id")

            # Map each token's character offset to its text so the full text
            # can be rebuilt from offsets alone.
            offset_token_map = {}
            for sent in data.get("sents", []):
                for token in sent.get("tok", []):
                    offset_token_map[token["o"]] = token["t"]

            # Rebuild the full text from sorted offsets, padding gaps between
            # tokens with spaces so character offsets stay aligned.
            full_text_parts = []
            last_end = 0
            for offset in sorted(offset_token_map):
                token = offset_token_map[offset]
                if offset > last_end:
                    full_text_parts.append(" " * (offset - last_end))
                full_text_parts.append(token)
                last_end = offset + len(token)
            # rstrip only: stripping leading padding would shift every offset.
            full_text = "".join(full_text_parts).rstrip()

            # Collect per-sentence character spans plus token offsets/lengths.
            sentences = []
            for sent in data.get("sents", []):
                tokens = sent.get("tok", [])
                if not tokens:
                    continue
                start = tokens[0]["o"]
                end = tokens[-1]["o"] + len(tokens[-1]["t"])
                newtokens = [
                    {"t": token["t"], "o": token["o"], "l": len(token["t"])}
                    for token in tokens
                ]
                sentences.append({"start": start, "end": end, "tokens": newtokens})

            results.append((doc_id, full_text, sentences))
    return results
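
# For reference, each archive line is expected to look roughly like the sketch
# below (illustrative values; only the field names "id", "sents", "tok", "t",
# and "o" are taken from the parsing code above):
#   {"id": "...", "sents": [{"tok": [{"t": "Paris", "o": 0}, {"t": "est", "o": 6}]}]}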
processed_cis = process_archive("../../data/lematin-1885.jsonl.bz2")
for doc_id, full_text, sentences in processed_cis:
    print(f"Document ID: {doc_id}")
    for sentence in sentences:
        start = sentence["start"]
        end = sentence["end"]
        tokens = sentence["tokens"]
        sentence_text = full_text[start:end]
        # Slice token surfaces back out of the full text so that any offset
        # misalignment shows up immediately.
        tokens_texts = [
            full_text[token["o"] : token["o"] + len(token["t"])]
            for token in tokens
        ]
        entities = ner_pipeline(sentence_text, tokens=tokens_texts)
        for entity in entities:
            # lOffset/rOffset are sentence-relative; convert them to absolute
            # character offsets into the full text.
            abs_start = start + entity["lOffset"]
            abs_end = start + entity["rOffset"]
            entity_text = full_text[abs_start:abs_end]
            entity_surface = entity["surface"]
            assert entity_text == entity_surface, f"Entity text mismatch: {entity_text} != {entity_surface}"
            print(
                f"{doc_id}: {entity_text} -- surface: {entity_surface} -- "
                f"{entity['type']} -- {abs_start} - {abs_end}"
            )