Spaces:

CadenShokat
/

modernbert-finetuned-embeddings

Paused

Caden Shokat

init commit

9cab4b9 3 months ago

781 Bytes

	from datasets import load_dataset
	from src.utils.paths import TRAIN_JSON, TEST_JSON
	from src.utils.seed import set_seed

	# Source columns that main() removes from the dataset before the
	# train/test files are written.
	REMOVE_COLS = [
	    "chunk_id",
	    "doc_id",
	    "question_id",
	    "answer_span",
	]

	def main():
	    """Build the train/test JSON files of (anchor, positive) pairs.

	    Loads the ``train`` split of ``CadenShokat/aws-rag-qa-positives``,
	    renames ``question`` to ``anchor`` and ``chunk`` to ``positive``, drops
	    the columns listed in ``REMOVE_COLS`` and attaches an integer ``id``
	    column. The rows are then shuffled and split 90/10 (both steps seeded
	    with 42), and the two parts are written to ``TRAIN_JSON`` and
	    ``TEST_JSON``. The output paths are printed at the end.
	    """
	    set_seed(42)

	    pairs = (
	        load_dataset("CadenShokat/aws-rag-qa-positives", split="train")
	        .rename_column("question", "anchor")
	        .rename_column("chunk", "positive")
	        .remove_columns(REMOVE_COLS)
	    )

	    # Ids are assigned before shuffling, so they follow the loaded row order.
	    row_ids = list(range(len(pairs)))
	    pairs = pairs.add_column("id", row_ids)

	    parts = pairs.shuffle(seed=42).train_test_split(test_size=0.1, seed=42)

	    # Write the train part first, then the test part.
	    for part_name, out_path in (("train", TRAIN_JSON), ("test", TEST_JSON)):
	        parts[part_name].to_json(str(out_path), orient="records")

	    print(f"Wrote:\n- {TRAIN_JSON}\n- {TEST_JSON}")

	# Run the export only when this file is executed as a script, not when it
	# is imported as a module.
	if __name__ == "__main__":
	    main()