p1atdev
/

dart-v1-base

@@ -1,26 +1,51 @@
 import logging
-import os
 import json
-from typing import Optional, Dict, List, Tuple, Union
 from pydantic.dataclasses import dataclass
-import numpy as np
-from numpy.typing import NDArray
 from transformers import PreTrainedTokenizerFast
 from tokenizers.decoders import Decoder
 logger = logging.getLogger(__name__)
-VOCAB_FILES_NAMES = {
     # Resource name -> local file name. Assigned to
     # DartTokenizer.vocab_files_names in the removed revision of the class.
-    "category_config": "category_config.json",
-}
-PRETRAINED_VOCAB_FILES_MAP = {
     # Resource name -> {checkpoint id: download URL}. Assigned to
     # DartTokenizer.pretrained_vocab_files_map in the removed revision.
     # NOTE(review): the local file name above is "category_config.json" but
     # this URL ends in "tag_category.json" -- confirm the mismatch was intended.
-    "category_config": {
-        "p1atdev/dart-tokenizer-v1": "https://huggingface.co/p1atdev/dart-tokenizer-v1/resolve/main/tag_category.json"
-    }
-}
 @dataclass
@@ -71,57 +96,17 @@ class DartDecoder:
 class DartTokenizer(PreTrainedTokenizerFast):
     """Dart tokenizer"""
     # Fast tokenizer subclass that installs a custom decoder. The lines marked
     # "-" are the removed revision, which also kept a per-token category lookup
     # table and mask helpers built on numpy.
-    vocab_files_names = VOCAB_FILES_NAMES
-    pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
-    def __init__(self, category_config, **kwargs):
         # category_config is passed to load_tag_category_config (defined outside
         # this view); every other keyword argument goes to the parent class.
         super().__init__(**kwargs)
         # Replace the backend tokenizer's decoder with a DartDecoder built from
         # the keys of get_added_vocab(). DartDecoder is defined outside this view.
         self._tokenizer.decoder = Decoder.custom(  # type: ignore
             DartDecoder(list(self.get_added_vocab().keys()))
         )
-        self.category_config = load_tag_category_config(category_config)
         # Lookup table indexed by token id, holding that token's category id.
         # Tokens not listed in any category keep the initial value 0.
         # NOTE(review): stored as uint8, so category ids above 255 would not
         # fit -- confirm the id range.
-        self._id_to_category_map = np.zeros(self.vocab_size).astype("uint8")
-        for (
-            category_id,
-            tokens,
-        ) in self.category_config.category_to_token_ids.items():
-            self._id_to_category_map[tokens] = int(category_id)
-    def create_vocab_mask(self, value: int = 1):
-        """Create an array of vocab size filled with specified value"""
         # The result is cast to uint8 after filling.
-        return np.full(self.vocab_size, value).astype("uint8")
-    def get_token_ids_in_category(self, category_id: Union[int, str]):
-        """Get token ids in the specified category"""
         # The lookup key is str(category_id), so int and str ids hit the same entry.
-        return self.category_config.category_to_token_ids[str(category_id)]
-    def get_category(self, category_id: Union[int, str]):
-        """Get the specified category config"""
         # Same str() key normalization as get_token_ids_in_category.
-        return self.category_config.categories[str(category_id)]
-    def convert_ids_to_category_ids(self, token_ids: Union[int, List[int]]):
-        """Get the category ids of specified tokens"""
         # Indexes the table built in __init__ with the given id or list of ids.
-        return self._id_to_category_map[token_ids]
-    def get_banned_tokens_mask(self, tokens: Union[str, List[str], int, List[int]]):
         # Builds a vocab-sized uint8 mask that is 1 everywhere except at the
         # given token positions, which are set to 0.
         # NOTE(review): a lone str is wrapped in a list but never converted to
         # an id (only the list branch calls convert_tokens_to_ids), so the
         # assert below rejects a single string argument.
-        if isinstance(tokens, str):
-            tokens = [tokens]
-        elif isinstance(tokens, int):
-            tokens = [tokens]
-        elif isinstance(tokens, list):
-            tokens = [  # type: ignore
-                self.convert_tokens_to_ids(token) if isinstance(token, str) else token
-                for token in tokens
-            ]
         # NOTE(review): validation relies on assert, which is skipped when
         # Python runs with -O.
-        assert isinstance(tokens, list) and all(
-            [isinstance(token, int) for token in tokens]
-        )
-        mask = self.create_vocab_mask(value=1)
-        mask[tokens] = 0
-        return mask

 import logging
 import json
+from typing import Dict, List
 from pydantic.dataclasses import dataclass
 from transformers import PreTrainedTokenizerFast
 from tokenizers.decoders import Decoder
 logger = logging.getLogger(__name__)
+# fmt: off
+# https://huggingface.co/docs/transformers/main/en/chat_templating
 # Jinja template returned by DartTokenizer.default_chat_template.
 # It emits '<|bos|>' and then four tagged sections in a fixed order:
 # rating, copyright, character, general.
 # `messages` is indexed by string key ('rating', 'copyright', ...), so the
 # template expects a mapping -- presumably callers pass a dict rather than a
 # list of chat messages; verify against callers.
+PROMPT_TEMPLATE = (
+    "{{ '<|bos|>' }}"
+    "{{ '<rating>' }}"
     # A missing or none rating falls back to 'rating:sfw, rating:general'.
+    "{% if 'rating' not in messages or messages['rating'] is none %}"
+    "{{ 'rating:sfw, rating:general' }}"
+    "{% else %}"
+    "{{ messages['rating'] }}"
+    "{% endif %}"
+    "{{ '</rating>' }}"
+    "{{ '<copyright>' }}"
     # The remaining sections fall back to an empty string when missing or none.
+    "{% if 'copyright' not in messages or messages['copyright'] is none %}"
+    "{{ '' }}"
+    "{% else %}"
+    "{{ messages['copyright'] }}"
+    "{% endif %}"
+    "{{ '</copyright>' }}"
+    "{{ '<character>' }}"
+    "{% if 'character' not in messages or messages['character'] is none %}"
+    "{{ '' }}"
+    "{% else %}"
+    "{{ messages['character'] }}"
+    "{% endif %}"
+    "{{ '</character>' }}"
+    "{{ '<general>' }}"
+    "{% if 'general' not in messages or messages['general'] is none %}"
+    "{{ '' }}"
+    "{% else %}"
+    "{{ messages['general'] }}"
+    "{% endif %}"
     # No closing '</general>' is emitted -- presumably so generation continues
     # inside the general section; confirm.
     # The joined literal has no leading or trailing whitespace, so .strip()
     # leaves the current text unchanged.
+).strip()
+# fmt: on
 @dataclass
 class DartTokenizer(PreTrainedTokenizerFast):
     """Dart tokenizer"""
     # Fast tokenizer subclass that installs a custom decoder and supplies the
     # module-level PROMPT_TEMPLATE as its default chat template.
+    def __init__(self, **kwargs):
         # All keyword arguments are forwarded unchanged to the parent class.
         super().__init__(**kwargs)
         # Replace the backend tokenizer's decoder with a DartDecoder built from
         # the keys of get_added_vocab(). DartDecoder is defined outside this view.
         self._tokenizer.decoder = Decoder.custom(  # type: ignore
             DartDecoder(list(self.get_added_vocab().keys()))
         )
+    @property
+    def default_chat_template(self):
+        """
+        Danbooru Tags Transformer uses special format prompt to generate danbooru tags.
+        """
         # Returns the module-level PROMPT_TEMPLATE string unchanged.
         # NOTE(review): this relies on the base class consulting
         # `default_chat_template`; confirm that hook still exists in the
         # installed transformers version.
+        return PROMPT_TEMPLATE