Spaces:
Running
Running
| ###################################################################### | |
| ## | |
| ## This script is a lightly modified version of that provided in winogender-schemas | |
| ## https://github.com/rudinger/winogender-schemas | |
| ## | |
| ###################################################################### | |
| import csv | |
| import os | |
| from pathlib import Path | |
| from collections import OrderedDict | |
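| # This script instantiates every template in winogender_schema/templates.tsv | |
| # (resolved relative to this file) four ways: with the template's own participant, | |
| # and with "someone", "man" and "woman" substituted for it. Pronoun placeholders become MASK. | |
| # get_sentences() writes the results to winogender_schema/all_sentences.tsv and returns them. | |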
def load_templates(path):
    """Read the tab-separated template file at *path*.

    The first line is treated as a header and skipped. Every remaining
    non-blank line is stripped of surrounding whitespace, split on tabs,
    and its first four fields are kept; any extra columns are ignored.

    Args:
        path: Location of the template TSV file.

    Returns:
        A list of ``(occupation, other_participant, answer, sentence)``
        tuples of strings, in file order. An empty or header-only file
        yields an empty list.

    Raises:
        IndexError: If a non-blank line has fewer than four fields.
    """
    templates = []
    # The context manager guarantees the handle is closed, even when a
    # malformed row raises part-way through the file.
    with open(path, 'r') as fp:
        next(fp, None)  # first line headers; tolerate a completely empty file
        for line in fp:
            fields = line.strip().split('\t')
            if fields == ['']:
                # Blank line (e.g. a trailing newline at end of file).
                continue
            # Explicit indexing keeps the IndexError contract for short rows.
            templates.append((fields[0], fields[1], fields[2], fields[3]))
    return templates
def generate(occupation, other_participant, sentence, second_ref="", context=None):
    """Instantiate one template sentence with its pronoun slots masked.

    The sentence is split on single spaces, so ``$OCCUPATION``,
    ``$PARTICIPANT`` and the pronoun placeholders are only recognised when
    they stand alone as whole tokens.

    Args:
        occupation: Text substituted for the first ``$OCCUPATION`` token.
        other_participant: Text substituted for the first ``$PARTICIPANT``
            token when ``second_ref`` is empty.
        sentence: Template containing ``$OCCUPATION`` and ``$PARTICIPANT``.
        second_ref: Alternative referent for the participant. ``"someone"``
            additionally drops the token preceding ``$PARTICIPANT`` (the
            article) and capitalises the word when it ends up first in the
            sentence; any other non-empty value replaces the token as-is.
        context: Unused; kept so existing callers keep working.

    Returns:
        The instantiated sentence with every ``$NOM_PRONOUN``,
        ``$POSS_PRONOUN`` and ``$ACC_PRONOUN`` token replaced by ``"MASK"``.

    Raises:
        ValueError: If ``$OCCUPATION`` or ``$PARTICIPANT`` is not a token
            of ``sentence``.
    """
    toks = sentence.split(" ")
    occ_index = toks.index("$OCCUPATION")
    part_index = toks.index("$PARTICIPANT")
    toks[occ_index] = occupation

    if not second_ref:
        # Use the instantiated participant, e.g. "client", "patient", ...
        toks[part_index] = other_participant
    elif second_ref != 'someone':
        toks[part_index] = second_ref
    else:
        # Bleached NP "someone": drop the token preceding $PARTICIPANT
        # (i.e. "the"). Only do so when such a token exists; with
        # part_index == 0 the old slice `toks[:part_index-1]` wrapped
        # around and duplicated most of the sentence.
        if part_index > 0:
            del toks[part_index - 1]
            part_index -= 1
        toks[part_index] = "Someone" if part_index == 0 else "someone"

    pronoun_slots = {"$NOM_PRONOUN", "$POSS_PRONOUN", "$ACC_PRONOUN"}
    return " ".join("MASK" if tok in pronoun_slots else tok for tok in toks)
| # %% | |
def get_sentences():
    """Instantiate every template and persist the results as a TSV file.

    Works inside the ``winogender_schema`` directory next to this script
    (created if missing): reads ``templates.tsv`` through
    ``load_templates`` and rewrites ``all_sentences.tsv`` with a
    ``sentid``/``sentence`` header followed by four rows per template.
    The rows are the template's own participant, then "someone", "man"
    and "woman" as the second referent, each produced by ``generate``.

    Returns:
        An ``OrderedDict`` mapping each sentence id
        (``<occupation>_<participant>_<answer>``) to its generated
        sentence, in the same order as the rows written to disk.
    """
    data_dir = os.path.join(os.path.dirname(__file__), "winogender_schema")
    Path(data_dir).mkdir(parents=True, exist_ok=True)

    templates = load_templates(os.path.join(data_dir, "templates.tsv"))

    # Referents that stand in for the template's own participant.
    substitute_refs = ("someone", "man", "woman")

    with open(os.path.join(data_dir, "all_sentences.tsv"), 'w', newline='') as csvfile:
        sentence_writer = csv.writer(csvfile, delimiter='\t')
        sentence_writer.writerow(['sentid', 'sentence'])
        sentence_dict = OrderedDict()
        for occupation, other_participant, answer, sentence in templates:
            # Generate all four variants before writing, so a failing
            # template never leaves a partial group of rows on disk.
            rows = [(
                f"{occupation}_{other_participant}_{answer}",
                generate(occupation, other_participant, sentence),
            )]
            for ref in substitute_refs:
                rows.append((
                    f"{occupation}_{ref}_{answer}",
                    generate(occupation, other_participant, sentence,
                             second_ref=ref),
                ))
            sentence_dict.update(rows)
            sentence_writer.writerows(rows)
    return sentence_dict