fixes data collation issue with padding
data.py CHANGED
@@ -167,13 +167,13 @@ class SmolLM3Dataset:
 
         def tokenize_function(examples):
             """Tokenize the examples"""
-            # Tokenize the texts
+            # Tokenize the texts with fixed length
             tokenized = self.tokenizer(
                 examples["text"],
                 truncation=True,
-                padding=
+                padding=True,  # Enable padding during tokenization
                 max_length=self.max_seq_length,
-                return_overflowing_tokens=
+                return_overflowing_tokens=False,  # Don't return overflowing tokens
                 return_length=True,
             )
 
@@ -263,6 +263,7 @@ class SmolLM3Dataset:
             mlm=False,  # We're doing causal LM, not masked LM
             pad_to_multiple_of=8,  # Pad to multiple of 8 for efficiency
             return_tensors="pt",  # Ensure we return PyTorch tensors
+            padding=True,  # Enable padding
         )
 
 def create_sample_dataset(output_path: str = "my_dataset"):
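Why the first hunk fixes the collation issue: without padding, tokenize_function returns ragged lists of input_ids, and stacking those into one rectangular tensor at collation time fails. With padding=True, every row in a batch comes out at a common length. Below is a minimal sketch of how the updated call behaves; the model id, sample texts, and max length are illustrative stand-ins, not taken from this repo.

from transformers import AutoTokenizer

# Illustrative tokenizer; data.py loads the real one elsewhere.
tokenizer = AutoTokenizer.from_pretrained("HuggingFaceTB/SmolLM3-3B")
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token  # common fallback for causal-LM tokenizers

examples = {"text": ["a short line", "a noticeably longer line that forces padding of the short one"]}

tokenized = tokenizer(
    examples["text"],
    truncation=True,
    padding=True,                     # pad all rows in the batch to a common length
    max_length=2048,                  # stand-in for self.max_seq_length
    return_overflowing_tokens=False,  # exactly one output row per input row
    return_length=True,               # also report each row's token count
)

# Every row now has the same length, so the batch stacks into a rectangular tensor.
assert len({len(ids) for ids in tokenized["input_ids"]}) == 1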
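For the second hunk: judging from the mlm=False and pad_to_multiple_of options, the surrounding call is presumably a transformers DataCollatorForLanguageModeling. A sketch of that standard causal-LM setup follows. Note it omits the newly added padding=True: as far as I know, DataCollatorForLanguageModeling accepts no padding keyword (its internal tokenizer.pad call already pads each batch), so the padding guarantee comes from pad_to_multiple_of and from the tokenization change above. The model id and feature values are illustrative.

from transformers import AutoTokenizer, DataCollatorForLanguageModeling

tokenizer = AutoTokenizer.from_pretrained("HuggingFaceTB/SmolLM3-3B")  # illustrative
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False,             # causal LM: labels copy input_ids (the model shifts them)
    pad_to_multiple_of=8,  # round each batch's padded length up to a multiple of 8
    return_tensors="pt",   # emit PyTorch tensors
)

# The collator pads a list of ragged examples into one batch and sets
# label positions that correspond to pad tokens to -100 (ignored by the loss).
features = [{"input_ids": [1, 2, 3]}, {"input_ids": [1, 2, 3, 4, 5]}]
batch = collator(features)
print(batch["input_ids"].shape)  # torch.Size([2, 8]) after pad_to_multiple_of=8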