# counting_edit / split_train_test.py
# Tevior's picture
# Upload from /scratch/ts1v23/workspace/flow_grpo_new/dataset/counting_edit @ 2026-01-11T15:59:22.144113+00:00
# fec0cbc verified
import json
import random
# Split a JSONL dataset into a fixed-size test set and a training set.
#
# Every record is read from INPUT_PATH, the records are shuffled with a fixed
# seed so the split is reproducible, the first TEST_SIZE shuffled records are
# written to TEST_PATH and the remaining ones to TRAIN_PATH.

INPUT_PATH = 'output.jsonl'
TEST_PATH = 'test_metadata.jsonl'
TRAIN_PATH = 'train_metadata.jsonl'
# Number of records held out for testing. The original comment claimed 128,
# but the code has always sliced at 112; the code's value is kept.
TEST_SIZE = 112
SEED = 42


def load_jsonl(path):
    """Return the list of JSON objects stored one per line in *path*.

    Blank lines (for example a trailing empty line) are skipped instead of
    raising ``json.JSONDecodeError``.
    """
    with open(path, 'r', encoding='utf-8') as f:
        return [json.loads(line) for line in f if line.strip()]


def split_records(records, test_size=TEST_SIZE, seed=SEED):
    """Shuffle *records* deterministically and split them in two.

    Args:
        records: Sequence of records to split; it is not modified.
        test_size: Number of shuffled records placed in the test set.
        seed: Seed for the shuffle.

    Returns:
        A ``(test_set, train_set)`` tuple of lists. If there are fewer than
        *test_size* records, all of them land in the test set and the
        training set is empty.

    Raises:
        ValueError: If *test_size* is negative.
    """
    if test_size < 0:
        raise ValueError(f"test_size must be non-negative, got {test_size}")
    shuffled = list(records)
    # A dedicated generator seeded with `seed` yields the same permutation as
    # random.seed(seed); random.shuffle(...) without touching global state.
    random.Random(seed).shuffle(shuffled)
    return shuffled[:test_size], shuffled[test_size:]


def write_jsonl(path, records):
    """Write *records* to *path*, one JSON document per line."""
    with open(path, 'w', encoding='utf-8') as f:
        for record in records:
            json.dump(record, f)
            f.write('\n')


def main():
    """Load the dataset, split it, save both parts and report their sizes."""
    data = load_jsonl(INPUT_PATH)
    test_set, train_set = split_records(data)

    write_jsonl(TEST_PATH, test_set)
    write_jsonl(TRAIN_PATH, train_set)

    print(f"Test set size: {len(test_set)}")
    print(f"Training set size: {len(train_set)}")


if __name__ == "__main__":
    main()