Spaces:
Sleeping
Sleeping
Upload event_detection_dataclean.py
Browse files- event_detection_dataclean.py +118 -0
event_detection_dataclean.py
ADDED
|
@@ -0,0 +1,118 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import re
|
| 2 |
+
import json
|
| 3 |
+
from collections import Counter
|
| 4 |
+
|
| 5 |
+
|
| 6 |
+
def load_texttag_file(texttag_filename):
    """Load a ``token<TAB>tag`` file and drop noisy tokens.

    The file content is split into sentences on blank lines (a blank line may
    contain a single tab). Every non-empty row of a sentence must hold exactly
    one tab separating the token from its tag. A token is discarded, together
    with its tag, when the noise pattern matches anywhere inside it (characters
    such as ``@ | ? ! + : ( )``, backslashes, slashes, and ``http...`` /
    ``www...`` fragments). Surviving tokens are lower-cased; tags are kept
    verbatim.

    Args:
        texttag_filename: Path of the text file to read.

    Returns:
        A tuple ``(texts_selected, tags_selected, tags_all)``.
        ``texts_selected`` and ``tags_selected`` are parallel lists holding one
        inner list per non-empty sentence (an inner list may be empty when all
        of its tokens were filtered out). ``tags_all`` is the flat list of
        every kept tag. All three are empty when the file does not exist.

    Raises:
        ValueError: If a non-empty row does not contain exactly one tab.
    """
    # Created before the ``try`` so the function can still return a
    # well-formed (empty) result when the file is missing.
    tags_all = []
    texts_selected = []
    tags_selected = []

    try:
        with open(texttag_filename, "r") as data_file:
            data_all = data_file.read()
    except FileNotFoundError as error:
        # Report the path argument: the file handle never exists when
        # ``open`` fails, and the exception must be stringified to print it.
        print("Sorry, the file {} does not exist.".format(texttag_filename))
        print("error: {}".format(error))
        return texts_selected, tags_selected, tags_all

    # Compiled once; it is applied to every token in the file.
    noise_pattern = re.compile(
        r"[@|?|!+?|:|(|)]|\\|\.*?\|-|/|/|/.*?/|http\S+|www\S+"
    )

    for sentence in re.split(r'\n\t?\n', data_all):
        if not sentence:
            continue
        texts_line = []
        tags_line = []
        for row in sentence.split("\n"):
            if not row:
                continue
            text, tag = row.split("\t")
            if noise_pattern.search(text) is None:
                texts_line.append(text.lower())
                tags_line.append(tag)
                tags_all.append(tag)
        texts_selected.append(texts_line)
        tags_selected.append(tags_line)

    return texts_selected, tags_selected, tags_all
|
| 34 |
+
|
| 35 |
+
|
| 36 |
+
def tag_ids_map(tags_all, tags2ids_name, ids2tags_name):
    """Build tag<->id lookup tables and write both to JSON files.

    The distinct tags are sorted and numbered consecutively from 0 in that
    order.

    Args:
        tags_all: Iterable of tag strings; duplicates are allowed.
        tags2ids_name: Output path for the tag -> id mapping.
        ids2tags_name: Output path for the id -> tag mapping.

    Returns:
        A tuple ``(tags2ids, ids2tags)`` of dictionaries. The in-memory
        ``ids2tags`` has integer keys; JSON serialisation stores them as
        strings in the written file.
    """
    ordered_tags = sorted(set(tags_all))

    tags2ids = {tag: index for index, tag in enumerate(ordered_tags)}
    ids2tags = dict(enumerate(ordered_tags))

    for path, mapping in ((tags2ids_name, tags2ids), (ids2tags_name, ids2tags)):
        with open(path, "w") as handle:
            json.dump(mapping, handle)

    return tags2ids, ids2tags
|
| 52 |
+
|
| 53 |
+
|
| 54 |
+
def add_tagids(tags_selected, tags2ids, ids2tags):
    """Translate every word-level tag into its integer id.

    Args:
        tags_selected: List of sentences, each a list of tag strings.
        tags2ids: Mapping from tag string to id.
        ids2tags: Not used by this function.

    Returns:
        A list with the same nesting as ``tags_selected`` in which each tag
        is replaced by ``tags2ids[tag]``.

    Raises:
        KeyError: If a tag is missing from ``tags2ids``.
    """
    return [
        [tags2ids[tag] for tag in sentence_tags]
        for sentence_tags in tags_selected
    ]
|
| 63 |
+
|
| 64 |
+
|
| 65 |
+
def add_text_tagid(tags_selected, tags2ids, ids2tags):
    """Derive one sentence-level tag (and its id) from the word-level tags.

    For each sentence:

    * if it has no tags at all, both of its entries are empty lists, so the
      outputs stay aligned with ``tags_selected``;
    * if it has exactly one distinct tag, that tag is used;
    * otherwise ``"O"`` is ignored and the greatest remaining tag in string
      order is used.

    Args:
        tags_selected: List of sentences, each a list of tag strings.
        tags2ids: Mapping from tag string to id.
        ids2tags: Not used by this function.

    Returns:
        A tuple ``(tags_chunk, tagids_chunk)`` of parallel lists with one
        entry per sentence; each entry is a list holding the chosen tag (or
        its id), or an empty list for a sentence without tags.

    Raises:
        KeyError: If the chosen tag is missing from ``tags2ids``.
    """
    tags_chunk = []
    tagids_chunk = []

    for tags_line in tags_selected:
        distinct_tags = set(tags_line)

        if not distinct_tags:
            # Every token of this sentence was filtered out upstream; there
            # is nothing to pick, and max() on an empty collection raises.
            tags_chunk.append([])
            tagids_chunk.append([])
            continue

        if len(distinct_tags) > 1:
            # "O" only wins when it is the sole tag present.
            distinct_tags.discard("O")

        # NOTE(review): max() selects the lexicographically greatest tag, not
        # the most frequent one -- confirm this is the intended tie-break for
        # sentences carrying several non-"O" tags.
        chosen_tag = max(distinct_tags)
        tags_chunk.append([chosen_tag])
        tagids_chunk.append([tags2ids[chosen_tag]])

    return tags_chunk, tagids_chunk
|
| 84 |
+
|
| 85 |
+
def save_json(json_filename, texts_selected, tags_selected, tagids_selected, tags_chunk, tagids_chunk):
    """Write one JSON record per sentence to ``json_filename``.

    Each record has the keys ``text``, ``word_tag``, ``word_tag_id``,
    ``text_tag`` and ``text_tag_id``, taken from the same index of the five
    parallel input lists. The number of records equals
    ``len(texts_selected)``.

    Args:
        json_filename: Output path; the file is overwritten.
        texts_selected: Per-sentence token lists; drives the record count.
        tags_selected: Per-sentence word-level tags.
        tagids_selected: Per-sentence word-level tag ids.
        tags_chunk: Per-sentence sentence-level tag.
        tagids_chunk: Per-sentence sentence-level tag id.

    Raises:
        IndexError: If any of the other lists is shorter than
            ``texts_selected``.
    """
    # Iterate over the actual data rather than a fixed record count, so that
    # small inputs do not overrun and large inputs are not truncated.
    save_datalist = [
        {
            "text": text,
            "word_tag": tags_selected[index],
            "word_tag_id": tagids_selected[index],
            "text_tag": tags_chunk[index],
            "text_tag_id": tagids_chunk[index],
        }
        for index, text in enumerate(texts_selected)
    ]

    with open(json_filename, 'w') as file:
        json.dump(save_datalist, file)

    return
|
| 102 |
+
|
| 103 |
+
def main(data_filename, json_filename, tags2ids_name, ids2tags_name):
    """Run the full cleaning pipeline for one data file.

    Loads the token/tag file, builds and writes the tag<->id mappings,
    computes word-level tag ids and sentence-level tags, and saves everything
    to ``json_filename``.

    Args:
        data_filename: Path of the raw token/tag text file.
        json_filename: Output path for the per-sentence JSON records.
        tags2ids_name: Output path for the tag -> id mapping.
        ids2tags_name: Output path for the id -> tag mapping.
    """
    texts, word_tags, every_tag = load_texttag_file(data_filename)
    tag_to_id, id_to_tag = tag_ids_map(every_tag, tags2ids_name, ids2tags_name)

    word_tag_ids = add_tagids(word_tags, tag_to_id, id_to_tag)
    sentence_tags, sentence_tag_ids = add_text_tagid(word_tags, tag_to_id, id_to_tag)

    save_json(
        json_filename,
        texts,
        word_tags,
        word_tag_ids,
        sentence_tags,
        sentence_tag_ids,
    )
|
| 111 |
+
|
| 112 |
+
|
| 113 |
+
if __name__ == "__main__":
    # Script entry point: process the dev split and write the JSON output
    # and both tag mappings into the same data directory.
    main(
        data_filename="../data/raw_EDT/Event_detection/dev.txt",
        json_filename='../data/raw_EDT/Event_detection/dev.json',
        tags2ids_name="../data/raw_EDT/Event_detection/tags2ids.json",
        ids2tags_name="../data/raw_EDT/Event_detection/ids2tags.json",
    )
|