Spaces:

emilylearning
/

llm_uncertainty

Running

App Files Files Community

llm_uncertainty / winogender_sentences.py

emilylearning

update explanations / comments

a828a08 over 2 years ago

raw

history blame

4.01 kB

	######################################################################
	##
	## This script is a lightly modified version of that provided in winogender-schemas
	## https://github.com/rudinger/winogender-schemas
	##
	######################################################################

	import csv
	import os
	from pathlib import Path
	from collections import OrderedDict

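	# This module fully instantiates the templates in winogender_schema/templates.tsv
	# (120 in the upstream dataset; the path is resolved relative to this file).
	# Each template yields four sentences, one per choice of second referent:
	# the template's own participant, "someone", "man" and "woman". Pronoun
	# placeholders are replaced by the literal token "MASK".
	# Unlike the upstream script, nothing is printed to stdout: calling
	# get_sentences() writes winogender_schema/all_sentences.tsv and returns the sentences.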

	def load_templates(path):
	    """Read Winogender-style sentence templates from a tab-separated file.

	    The first line of the file is treated as a header and skipped. Every
	    other non-blank line must carry at least four tab-separated fields, in
	    this order: occupation, other participant, answer, sentence template.
	    Fields beyond the fourth are ignored.

	    Args:
	        path: Location of the TSV file to read.

	    Returns:
	        A list of ``(occupation, other_participant, answer, sentence)``
	        tuples of strings, in file order. Empty if the file has no data lines.

	    Raises:
	        IndexError: If a non-blank data line has fewer than four fields.
	    """
	    templates = []
	    # The context manager closes the handle even if a malformed line raises.
	    with open(path, 'r', encoding='utf-8') as fp:
	        next(fp, None)  # first line is headers; the default tolerates an empty file
	        for line in fp:
	            fields = line.strip().split('\t')
	            if fields == ['']:
	                # Blank line (e.g. a trailing newline at end of file).
	                continue
	            templates.append((fields[0], fields[1], fields[2], fields[3]))
	    return templates

	def generate(occupation, other_participant, sentence, second_ref="", context=None):
	    """Instantiate one template sentence and mask its pronoun placeholders.

	    The sentence is split on single spaces. The first ``$OCCUPATION`` token
	    is replaced by ``occupation`` and the first ``$PARTICIPANT`` token by
	    the chosen second referent. Every token that is exactly
	    ``$NOM_PRONOUN``, ``$POSS_PRONOUN`` or ``$ACC_PRONOUN`` becomes the
	    literal string ``"MASK"``.

	    Args:
	        occupation: Text substituted for ``$OCCUPATION``.
	        other_participant: Text substituted for ``$PARTICIPANT`` when
	            ``second_ref`` is empty.
	        sentence: Template with space-separated placeholder tokens.
	        second_ref: Optional override for the participant. ``"someone"``
	            is special-cased: the token preceding ``$PARTICIPANT`` (i.e.
	            "the") is dropped and the word is capitalised when it ends up
	            first in the sentence. Any other non-empty value is inserted
	            verbatim.
	        context: Not used by this function; kept so existing callers that
	            pass it keep working.

	    Returns:
	        The instantiated sentence with pronoun placeholders masked.

	    Raises:
	        ValueError: If ``$OCCUPATION`` or ``$PARTICIPANT`` does not occur
	            as a standalone token in ``sentence``.
	    """
	    toks = sentence.split(" ")
	    occ_index = toks.index("$OCCUPATION")
	    part_index = toks.index("$PARTICIPANT")
	    toks[occ_index] = occupation
	    if not second_ref:
	        # Use the instantiated participant, e.g. "client", "patient", "customer", ...
	        toks[part_index] = other_participant
	    elif second_ref != 'someone':
	        toks[part_index] = second_ref
	    else:
	        # Use the bleached NP "someone" for the other participant: remove
	        # the token that precedes $PARTICIPANT, i.e. "the". Guard against
	        # $PARTICIPANT being the very first token, where index -1 would
	        # otherwise slice from the wrong end and duplicate the sentence.
	        if part_index > 0:
	            del toks[part_index - 1]
	            part_index -= 1
	        toks[part_index] = "Someone" if part_index == 0 else "someone"
	    pronoun_placeholders = {"$NOM_PRONOUN", "$POSS_PRONOUN", "$ACC_PRONOUN"}
	    return " ".join(
	        "MASK" if tok in pronoun_placeholders else tok for tok in toks
	    )
	# %%


	def get_sentences():
	    """Generate all masked sentence variants and save them next to this module.

	    Templates are read from ``winogender_schema/templates.tsv`` inside the
	    directory containing this file (the directory is created if missing).
	    For each template, four sentences are produced via ``generate``: one
	    with the template's own participant, then one each with "someone",
	    "man" and "woman" as the second referent. All of them are written to
	    ``winogender_schema/all_sentences.tsv`` as tab-separated
	    ``sentid``/``sentence`` rows beneath a header row.

	    Returns:
	        An ``OrderedDict`` mapping each sentence id
	        (``<occupation>_<referent>_<answer>``) to its sentence, in the
	        order the rows are written.
	    """
	    schema_dir = os.path.join(os.path.dirname(__file__), "winogender_schema")
	    Path(schema_dir).mkdir(parents=True, exist_ok=True)

	    templates = load_templates(os.path.join(schema_dir, "templates.tsv"))
	    output_path = os.path.join(schema_dir, "all_sentences.tsv")

	    sentence_dict = OrderedDict()
	    with open(output_path, 'w', newline='') as csvfile:
	        sentence_writer = csv.writer(csvfile, delimiter='\t')
	        sentence_writer.writerow(['sentid', 'sentence'])

	        for occupation, other_participant, answer, sentence in templates:
	            # (label used in the sentence id, second_ref handed to generate);
	            # an empty second_ref selects the template's own participant.
	            variants = (
	                (other_participant, ""),
	                ("someone", 'someone'),
	                ("man", 'man'),
	                ("woman", 'woman'),
	            )
	            rows = [
	                (
	                    f"{occupation}_{label}_{answer}",
	                    generate(occupation, other_participant, sentence,
	                             second_ref=ref),
	                )
	                for label, ref in variants
	            ]
	            # Record all four variants first, then emit their rows in the same order.
	            sentence_dict.update(rows)
	            sentence_writer.writerows(rows)

	    return sentence_dict