Spaces:
Running
Running
Adds dataset splits
Browse files
data.py
CHANGED
|
@@ -74,6 +74,17 @@ class SmolLM3Dataset:
|
|
| 74 |
try:
|
| 75 |
dataset = load_dataset(self.data_path)
|
| 76 |
logger.info(f"Loaded Hugging Face dataset: {self.data_path}")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 77 |
return dataset
|
| 78 |
except Exception as e:
|
| 79 |
logger.error(f"Failed to load dataset: {e}")
|
|
|
|
| 74 |
try:
|
| 75 |
dataset = load_dataset(self.data_path)
|
| 76 |
logger.info(f"Loaded Hugging Face dataset: {self.data_path}")
|
| 77 |
+
# If 'validation' or 'test' is missing, rebuild all three splits from 'train'
|
| 78 |
+
if ("train" in dataset) and ("validation" not in dataset or "test" not in dataset):
|
| 79 |
+
logger.info("Automatically splitting train into train/validation/test (98/1/1)")
|
| 80 |
+
split_dataset = dataset["train"].train_test_split(test_size=0.02, seed=42)
|
| 81 |
+
# Now split test into validation and test (1% each)
|
| 82 |
+
val_test_split = split_dataset["test"].train_test_split(test_size=0.5, seed=42)
|
| 83 |
+
dataset = {
|
| 84 |
+
"train": split_dataset["train"],
|
| 85 |
+
"validation": val_test_split["train"],
|
| 86 |
+
"test": val_test_split["test"]
|
| 87 |
+
}
|
| 88 |
return dataset
|
| 89 |
except Exception as e:
|
| 90 |
logger.error(f"Failed to load dataset: {e}")
|