from transformers import pipeline, AutoTokenizer
import bz2, json
from pprint import pprint

MODEL_NAME = "impresso-project/ner-stacked-bert-multilingual-light"

# Load the tokenizer explicitly; the pipeline below loads the model itself
ner_tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

ner_pipeline = pipeline(
    "generic-ner",
    model=MODEL_NAME,
    tokenizer=ner_tokenizer,
    trust_remote_code=True,
    device="cpu",
)
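
# Note: "generic-ner" is a custom task defined in the model repository, which is
# why trust_remote_code=True is required; device="cpu" can be changed to "cuda:0"
# or a GPU index (e.g. device=0) if one is available.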

def process_archive(lingproc_path):
    """
    Processes paired NER and full-text archives to extract full text and sentence offsets.

    Args:
        ner_path (str): Path to the NER .jsonl.bz2 archive.
        fulltext_path (str): Path to the full-text .jsonl.bz2 archive.

    Returns:
        List of tuples: (doc_id, full_text, sentence_offsets)
    """
    results = []

    with bz2.open(lingproc_path, mode='rt', encoding='utf-8') as f:
        for line in f:
            data = json.loads(line)
            doc_id = data.get("id")

            # Reconstruct the full text from all tokens using their offsets
            offset_token_map = {}
            for sent in data.get("sents", []):
                for token in sent.get("tok", []):
                    offset = token["o"]
                    text = token["t"]
                    offset_token_map[offset] = text

            # Rebuild full text from sorted offsets
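            # Pad gaps between consecutive tokens with spaces so every original
            # character offset maps to the same position in the rebuilt string.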
            full_text_parts = []
            sorted_offsets = sorted(offset_token_map.keys())
            last_end = 0
            for offset in sorted_offsets:
                token = offset_token_map[offset]
                if offset > last_end:
                    full_text_parts.append(" " * (offset - last_end))
                full_text_parts.append(token)
                last_end = offset + len(token)
            full_text = "".join(full_text_parts).strip()

            sentences = []
            for sent in data.get("sents", []):
                tokens = sent.get("tok", [])
                if not tokens:
                    continue
                start = tokens[0]["o"]
                end = tokens[-1]["o"] + len(tokens[-1]["t"])
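                # Keep each token's surface form, offset, and length ("l") so the
                # rebuilt text can be sliced later without re-tokenizing.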
                newtokens = [{"t": token["t"], "o": token["o"], "l": len(token["t"])} for token in tokens]
                sentences.append({"start": start, "end": end, "tokens": newtokens})
            results.append((doc_id, full_text, sentences))

    return results

processed_cis = process_archive("../../data/lematin-1885.jsonl.bz2")

for doc_id, full_text, sentences in processed_cis:
    print(f"Document ID: {doc_id}")
    # print(f"Full Text: {full_text}")
    # print("Sentences:")
    for sentence in sentences:
        start = sentence["start"]
        end = sentence["end"]
        tokens = sentence["tokens"]
        sentence_text = full_text[start:end]
        tokens_texts = [full_text[token["o"]:token["o"] + len(token["t"])] for token in tokens]
        # print(sentence_text)

        entities = ner_pipeline(sentence_text, tokens=tokens_texts)
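        # Each returned entity is expected to carry sentence-relative "lOffset" /
        # "rOffset" character offsets plus "surface" and "type" fields, as consumed
        # below; offsets are shifted back to document coordinates before printing.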

        for entity in entities:
            abs_start = sentence["start"] + entity["lOffset"]
            abs_end = sentence["start"] + entity["rOffset"]
            entity_text = full_text[abs_start:abs_end]
            entity_surface = entity["surface"]
            assert entity_text == entity_surface, f"Entity text mismatch: {entity_text} != {entity_surface}"
            print(f"{doc_id}: {entity_text} -- surface: {entity_surface} -- {entity['type']} -- {abs_start} - {abs_end}")
        # pprint(entities)

        # print(f"  Sentence: {sentence_text} (Start: {start}, End: {end})")
        # for token in tokens:
        #     token_text = token["t"]
        #     token_offset = token["o"]
        #     token_label = token["l"]
        #     print(f"    Token: {token_text} (Offset: {token_offset}, Label: {token_label})")

