Possible bug: tokenizer.vocab_size is misconfigured (Screws up embedding input_dim)

#38
by david-thrower - opened

TLDR:

  • The tokenizer for HuggingFaceTB/SmolLM3-3B reports a vocab_size of 128,000.
  • The actual vocabulary is 128,256 ids (see the config cross-check sketched below).
  • Using tokenizer.vocab_size to set an embedding layer's input_dim therefore creates an embedding table that is too small, and training fails with an out-of-range index error.
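To double-check what size the model itself expects, the checkpoint's config can be read directly (a minimal sketch; it only assumes the standard transformers AutoConfig API):

from transformers import AutoConfig

# The config records the embedding-matrix size the model was trained with,
# which is the value a Keras embedding's input_dim needs to cover.
config = AutoConfig.from_pretrained("HuggingFaceTB/SmolLM3-3B")
print(config.vocab_size)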

How I stumbled on this:

My code:


import tensorflow as tf
from transformers import AutoTokenizer

inp = tf.keras.layers.Input(shape=(), dtype=tf.string)


@tf.keras.utils.register_keras_serializable()
class NewTokenizerLayer(tf.keras.layers.Layer):
    def __init__(self, max_seq_length, tokenizer_checkpoint, **kwargs):
        super().__init__(**kwargs)
        self.max_seq_length = max_seq_length
        self.tokenizer_checkpoint = tokenizer_checkpoint
        self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_checkpoint)
        
        # Ensure tokenizer has a padding token
        if self.tokenizer.pad_token is None:
            self.tokenizer.pad_token = self.tokenizer.eos_token

    def call(self, inputs):
        def tokenize_py_fn(inputs):
            # Convert TensorFlow bytes to Python strings
            texts = [text.decode('utf-8') for text in inputs.numpy()]
            
            # Tokenize with Hugging Face tokenizer
            tokenized = self.tokenizer(
                texts,
                max_length=self.max_seq_length,
                padding='max_length',
                truncation=True,
                return_tensors='tf'
            )
            return tokenized['input_ids'].numpy()
        
        # Wrap Python function in TensorFlow operation
        input_ids = tf.py_function(
            tokenize_py_fn,
            [inputs],
            Tout=tf.int32
        )
        
        # Set the static shape for downstream layers
        input_ids.set_shape([None, self.max_seq_length])
        
        return input_ids

    def get_config(self):
        config = super().get_config()
        config.update({
            'max_seq_length': self.max_seq_length,
            'tokenizer_checkpoint': self.tokenizer_checkpoint
        })
        return config

    @classmethod
    def from_config(cls, config):
        return cls(
            max_seq_length=config['max_seq_length'],
            tokenizer_checkpoint=config['tokenizer_checkpoint']
        )


max_seq_length = 1536
tokenizer_checkpoint = "HuggingFaceTB/SmolLM3-3B"

# Instance of the tf.keras.layers.Layer subclass above that wraps this tokenizer
# (the "gp2" name is a misnomer left over from an earlier GPT-2 version of this code ... forgive the arcane nomenclature / ghost of versions past ...)
gp2_tokenizer = NewTokenizerLayer(max_seq_length=max_seq_length, tokenizer_checkpoint=tokenizer_checkpoint)
VOCABULARY_SIZE = gp2_tokenizer.tokenizer.vocab_size
# ^ This returns 128000, when 128256 is the actual vocabulary size ...
# This under-sizes my embedding layer

tokens = gp2_tokenizer(inp)

# On larger hardware, these could probably be increased considerably,
# which would likely improve performance ...
EMBEDDING_N = 12
EMBEDDING_DIM = int(EMBEDDING_N * 2)  # Embedding width used by the layer below

embedded = tf.keras.layers.Embedding(
    input_dim=VOCABULARY_SIZE,  # <-----------<<< This parameter is now too small and makes training fail.
    output_dim=EMBEDDING_DIM,
    input_length=max_seq_length,
    mask_zero=True)(tokens)

This code throws the following exception:


indices[15,1440] = 128012 is not in [0, 128000)   ### <-------------------------------------<<<<<<<<
...
[Op:__inference_multi_step_on_iterator_91546]

A token id of 128012 is >= 128000, so it falls outside the valid index range [0, 128000) of an embedding with input_dim=128000.
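To see what the offending id actually maps to, the tokenizer can decode it directly (a small diagnostic sketch; it only uses the standard convert_ids_to_tokens and vocab_size APIs):

from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("HuggingFaceTB/SmolLM3-3B")
# Look up the token behind the failing index and compare the two size reports
print(tok.convert_ids_to_tokens(128012))
print(tok.vocab_size, len(tok))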

...

I verified this from a Python REPL:


>>> from transformers import AutoTokenizer
>>> tokenizer_checkpoint = "HuggingFaceTB/SmolLM3-3B"
>>> tokenizer = AutoTokenizer.from_pretrained(tokenizer_checkpoint)
>>> tokenizer.vocab_size
128000
>>> len(tokenizer)
128256
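The 256-id gap appears to come from tokens that transformers treats as added/special tokens, which vocab_size does not count. They can be inspected like this (a sketch; it assumes the extra ids are registered as added tokens and uses the standard get_added_vocab API):

added = tokenizer.get_added_vocab()              # added/special tokens not counted by vocab_size
print(len(added))                                # how many ids the gap contains
print(min(added.values()), max(added.values()))  # where those ids live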

Workaround for anyone else who runs into the same problem: use len(tokenizer), which counts the full vocabulary including the added special tokens, instead of tokenizer.vocab_size:

from transformers import AutoTokenizer

tokenizer_checkpoint = "HuggingFaceTB/SmolLM3-3B"
tokenizer = AutoTokenizer.from_pretrained(tokenizer_checkpoint)
VOCABULARY_SIZE = len(tokenizer)  # NOT tokenizer.vocab_size

embedded = tf.keras.layers.Embedding(
    input_dim=VOCABULARY_SIZE,  # <-----------<<< Now correct: covers all 128256 token ids
    # ...
)(tokens)
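A cheap guard next to the embedding definition makes this kind of mismatch fail fast instead of mid-training (a sketch using the names above; with the workaround it passes trivially, but it would have caught the vocab_size version immediately):

# Sanity check: the embedding table must be able to hold every id the
# tokenizer can emit, including added special tokens.
assert VOCABULARY_SIZE >= len(tokenizer), (
    f"Embedding input_dim={VOCABULARY_SIZE} is smaller than the tokenizer's "
    f"full vocabulary of {len(tokenizer)} ids"
)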
