Spaces:
Running
Running
| """Tokenization classes for vibevoice.""" | |
| from typing import List, Optional, Union | |
| from transformers.utils import logging | |
| from transformers.models.qwen2.tokenization_qwen2 import Qwen2Tokenizer | |
| from transformers.models.qwen2.tokenization_qwen2_fast import Qwen2TokenizerFast | |
| logger = logging.get_logger(__name__) | |
class VibeVoiceTextTokenizer(Qwen2Tokenizer):
    """
    Construct a VibeVoice tokenizer. Based on the Qwen2 tokenizer with additional special tokens for speech.

    The Qwen2 vision placeholder tokens are reused as speech markers:
    `<|vision_start|>` (speech start), `<|vision_end|>` (speech end) and
    `<|vision_pad|>` (speech diffusion pad). Their ids, together with the
    end-of-sequence id, are cached once at construction time and exposed
    through the `speech_start_id()`, `speech_end_id()`,
    `speech_diffusion_id()` and `eos_id()` accessor methods.

    Args:
        vocab_file (`str`):
            Path to the vocabulary file.
        merges_file (`str`):
            Path to the merges file.
        errors (`str`, *optional*, defaults to `"replace"`):
            Paradigm to follow when decoding bytes to UTF-8.
        unk_token (`str`, *optional*, defaults to `"<|endoftext|>"`):
            The unknown token.
        bos_token (`str`, *optional*):
            The beginning of sequence token. Not used for vibevoice.
        eos_token (`str`, *optional*, defaults to `"<|endoftext|>"`):
            The end of sequence token.
        pad_token (`str`, *optional*, defaults to `"<|endoftext|>"`):
            The token used for padding.
        add_prefix_space (`bool`, *optional*, defaults to `False`):
            Forwarded unchanged to the Qwen2 tokenizer constructor.
        add_special_tokens (`bool`, *optional*, defaults to `True`):
            Whether or not to add special tokens when encoding.
    """

    model_input_names = ["input_ids", "attention_mask"]

    def __init__(
        self,
        vocab_file,
        merges_file,
        errors="replace",
        unk_token="<|endoftext|>",
        bos_token=None,
        eos_token="<|endoftext|>",
        pad_token="<|endoftext|>",
        add_prefix_space=False,
        add_special_tokens=True,
        **kwargs,
    ):
        super().__init__(
            vocab_file=vocab_file,
            merges_file=merges_file,
            errors=errors,
            unk_token=unk_token,
            bos_token=bos_token,
            eos_token=eos_token,
            pad_token=pad_token,
            add_prefix_space=add_prefix_space,
            add_special_tokens=add_special_tokens,
            **kwargs,
        )

        # Add VibeVoice-specific special tokens
        self._add_vibevoice_special_tokens()

    def _add_vibevoice_special_tokens(self):
        """Register the speech special tokens and cache their ids.

        Returns:
            The value returned by `add_special_tokens` for the speech tokens.
        """
        special_tokens = {
            "additional_special_tokens": [
                "<|vision_start|>",  # Speech start (reusing vision tokens)
                "<|vision_end|>",  # Speech end
                "<|vision_pad|>",  # Speech diffusion pad
            ]
        }
        num_added = self.add_special_tokens(special_tokens)

        # Cache special token IDs
        self._speech_start_id = self.convert_tokens_to_ids("<|vision_start|>")
        self._speech_end_id = self.convert_tokens_to_ids("<|vision_end|>")
        self._speech_diffusion_id = self.convert_tokens_to_ids("<|vision_pad|>")

        # Resolve EOS from the configured `eos_token` instead of a hard-coded
        # literal, so an overridden `eos_token` is honoured. This matches what
        # VibeVoiceTextTokenizerFast does; with the default arguments the
        # resulting id is the same as before.
        self._eos_id = self.eos_token_id

        return num_added

    def eos_id(self) -> int:
        """Id of the end of sequence token."""
        return self._eos_id

    def speech_start_id(self) -> int:
        """Id of the speech start token."""
        return self._speech_start_id

    def speech_end_id(self) -> int:
        """Id of the speech end token."""
        return self._speech_end_id

    def speech_diffusion_id(self) -> int:
        """Id of the speech diffusion token."""
        return self._speech_diffusion_id

    def pad_id(self) -> int:
        """Id used for padding (returns -100 for loss masking)."""
        return -100
class VibeVoiceTextTokenizerFast(Qwen2TokenizerFast):
    """
    Construct a "fast" VibeVoice tokenizer (backed by HuggingFace's *tokenizers* library).

    Based on the Qwen2 tokenizer with additional special tokens for speech.
    The ids of the speech tokens, of the end-of-sequence token and of the
    `<|image_pad|>` padding token are looked up once at construction time and
    served by the accessor methods below.

    Args:
        vocab_file (`str`, *optional*):
            Path to the vocabulary file.
        merges_file (`str`, *optional*):
            Path to the merges file.
        tokenizer_file (`str`, *optional*):
            Path to [tokenizers](https://github.com/huggingface/tokenizers) file.
        unk_token (`str`, *optional*, defaults to `"<|endoftext|>"`):
            The unknown token.
        bos_token (`str`, *optional*):
            The beginning of sequence token. Not used for vibevoice.
        eos_token (`str`, *optional*, defaults to `"<|endoftext|>"`):
            The end of sequence token.
        pad_token (`str`, *optional*, defaults to `"<|endoftext|>"`):
            The token used for padding.
        add_prefix_space (`bool`, *optional*, defaults to `False`):
            Forwarded unchanged to the Qwen2 fast tokenizer constructor.
    """

    model_input_names = ["input_ids", "attention_mask"]

    def __init__(
        self,
        vocab_file=None,
        merges_file=None,
        tokenizer_file=None,
        unk_token="<|endoftext|>",
        bos_token=None,
        eos_token="<|endoftext|>",
        pad_token="<|endoftext|>",
        add_prefix_space=False,
        **kwargs,
    ):
        # Everything named explicitly in the signature is handed to the Qwen2
        # base class as-is, followed by any extra keyword arguments.
        qwen2_kwargs = {
            "vocab_file": vocab_file,
            "merges_file": merges_file,
            "tokenizer_file": tokenizer_file,
            "unk_token": unk_token,
            "bos_token": bos_token,
            "eos_token": eos_token,
            "pad_token": pad_token,
            "add_prefix_space": add_prefix_space,
        }
        super().__init__(**qwen2_kwargs, **kwargs)

        # The speech tokens can only be registered once the base tokenizer exists.
        self._add_vibevoice_special_tokens()

    def _add_vibevoice_special_tokens(self):
        """Register the speech special tokens and cache the ids used by the accessors.

        Returns:
            The value returned by `add_special_tokens` for the speech tokens.
        """
        # The Qwen2 vision tokens are reused for speech, in this order:
        # speech start, speech end, speech diffusion pad.
        speech_tokens = ["<|vision_start|>", "<|vision_end|>", "<|vision_pad|>"]
        added_count = self.add_special_tokens({"additional_special_tokens": speech_tokens})

        # One batched lookup; the ids come back in the same order as the tokens.
        start_id, end_id, diffusion_id = self.convert_tokens_to_ids(speech_tokens)
        self._speech_start_id = start_id
        self._speech_end_id = end_id
        self._speech_diffusion_id = diffusion_id

        # EOS follows whatever `eos_token` the tokenizer was configured with (qwen2 / qwen3).
        self._eos_id = self.eos_token_id
        # Padding uses the id of the `<|image_pad|>` token.
        self._pad_id = self.convert_tokens_to_ids('<|image_pad|>')

        return added_count

    def eos_id(self) -> int:
        """Return the cached id of the end-of-sequence token."""
        return self._eos_id

    def speech_start_id(self) -> int:
        """Return the cached id of the speech start token (`<|vision_start|>`)."""
        return self._speech_start_id

    def speech_end_id(self) -> int:
        """Return the cached id of the speech end token (`<|vision_end|>`)."""
        return self._speech_end_id

    def speech_diffusion_id(self) -> int:
        """Return the cached id of the speech diffusion token (`<|vision_pad|>`)."""
        return self._speech_diffusion_id

    def pad_id(self) -> int:
        """Return the cached id of the `<|image_pad|>` token, used for padding."""
        return self._pad_id
# Names exported by `from <module> import *`: both VibeVoice text tokenizer classes.
__all__ = ["VibeVoiceTextTokenizer", "VibeVoiceTextTokenizerFast"]