litagin committed (verified)
Commit f54823d · 1 Parent(s): d25dff5

Delete bert

bert/bert_models.json DELETED
@@ -1,14 +0,0 @@
- {
-   "deberta-v2-large-japanese-char-wwm": {
-     "repo_id": "ku-nlp/deberta-v2-large-japanese-char-wwm",
-     "files": ["pytorch_model.bin"]
-   },
-   "chinese-roberta-wwm-ext-large": {
-     "repo_id": "hfl/chinese-roberta-wwm-ext-large",
-     "files": ["pytorch_model.bin"]
-   },
-   "deberta-v3-large": {
-     "repo_id": "microsoft/deberta-v3-large",
-     "files": ["spm.model", "pytorch_model.bin"]
-   }
- }
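For context on what this deletion removes: bert_models.json mapped each local BERT directory name to a Hugging Face repository and the files to fetch from it. The loader that consumed this mapping is not part of this diff; the snippet below is only a minimal sketch built on huggingface_hub, assuming a hypothetical download_bert_models() helper and the bert/<model_name>/ layout implied by the config.

```python
import json
from pathlib import Path

from huggingface_hub import hf_hub_download  # official Hub download helper


def download_bert_models(config_path: str = "bert/bert_models.json") -> None:
    """Hypothetical helper: fetch every file listed in bert_models.json
    into bert/<model_name>/, mirroring the directory layout deleted here."""
    models = json.loads(Path(config_path).read_text(encoding="utf-8"))
    for model_name, spec in models.items():
        local_dir = Path("bert") / model_name
        local_dir.mkdir(parents=True, exist_ok=True)
        for filename in spec["files"]:
            # hf_hub_download caches the file and places a copy under local_dir.
            hf_hub_download(
                repo_id=spec["repo_id"],
                filename=filename,
                local_dir=local_dir,
            )


if __name__ == "__main__":
    download_bert_models()
```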
 
bert/deberta-v2-large-japanese-char-wwm/.gitattributes DELETED
@@ -1,34 +0,0 @@
- *.7z filter=lfs diff=lfs merge=lfs -text
- *.arrow filter=lfs diff=lfs merge=lfs -text
- *.bin filter=lfs diff=lfs merge=lfs -text
- *.bz2 filter=lfs diff=lfs merge=lfs -text
- *.ckpt filter=lfs diff=lfs merge=lfs -text
- *.ftz filter=lfs diff=lfs merge=lfs -text
- *.gz filter=lfs diff=lfs merge=lfs -text
- *.h5 filter=lfs diff=lfs merge=lfs -text
- *.joblib filter=lfs diff=lfs merge=lfs -text
- *.lfs.* filter=lfs diff=lfs merge=lfs -text
- *.mlmodel filter=lfs diff=lfs merge=lfs -text
- *.model filter=lfs diff=lfs merge=lfs -text
- *.msgpack filter=lfs diff=lfs merge=lfs -text
- *.npy filter=lfs diff=lfs merge=lfs -text
- *.npz filter=lfs diff=lfs merge=lfs -text
- *.onnx filter=lfs diff=lfs merge=lfs -text
- *.ot filter=lfs diff=lfs merge=lfs -text
- *.parquet filter=lfs diff=lfs merge=lfs -text
- *.pb filter=lfs diff=lfs merge=lfs -text
- *.pickle filter=lfs diff=lfs merge=lfs -text
- *.pkl filter=lfs diff=lfs merge=lfs -text
- *.pt filter=lfs diff=lfs merge=lfs -text
- *.pth filter=lfs diff=lfs merge=lfs -text
- *.rar filter=lfs diff=lfs merge=lfs -text
- *.safetensors filter=lfs diff=lfs merge=lfs -text
- saved_model/**/* filter=lfs diff=lfs merge=lfs -text
- *.tar.* filter=lfs diff=lfs merge=lfs -text
- *.tflite filter=lfs diff=lfs merge=lfs -text
- *.tgz filter=lfs diff=lfs merge=lfs -text
- *.wasm filter=lfs diff=lfs merge=lfs -text
- *.xz filter=lfs diff=lfs merge=lfs -text
- *.zip filter=lfs diff=lfs merge=lfs -text
- *.zst filter=lfs diff=lfs merge=lfs -text
- *tfevents* filter=lfs diff=lfs merge=lfs -text
 
bert/deberta-v2-large-japanese-char-wwm/README.md DELETED
@@ -1,89 +0,0 @@
- ---
- language: ja
- license: cc-by-sa-4.0
- library_name: transformers
- tags:
- - deberta
- - deberta-v2
- - fill-mask
- - character
- - wwm
- datasets:
- - wikipedia
- - cc100
- - oscar
- metrics:
- - accuracy
- mask_token: "[MASK]"
- widget:
- - text: "京都大学で自然言語処理を[MASK][MASK]する。"
- ---
-
- # Model Card for Japanese character-level DeBERTa V2 large
-
- ## Model description
-
- This is a Japanese DeBERTa V2 large model pre-trained on Japanese Wikipedia, the Japanese portion of CC-100, and the Japanese portion of OSCAR.
- This model is trained with character-level tokenization and whole word masking.
-
- ## How to use
-
- You can use this model for masked language modeling as follows:
-
- ```python
- from transformers import AutoTokenizer, AutoModelForMaskedLM
- tokenizer = AutoTokenizer.from_pretrained('ku-nlp/deberta-v2-large-japanese-char-wwm')
- model = AutoModelForMaskedLM.from_pretrained('ku-nlp/deberta-v2-large-japanese-char-wwm')
-
- sentence = '京都大学で自然言語処理を[MASK][MASK]する。'
- encoding = tokenizer(sentence, return_tensors='pt')
- ...
- ```
-
- You can also fine-tune this model on downstream tasks.
-
- ## Tokenization
-
- There is no need to tokenize texts in advance, and you can give raw texts to the tokenizer.
- The texts are tokenized into character-level tokens by [sentencepiece](https://github.com/google/sentencepiece).
-
- ## Training data
-
- We used the following corpora for pre-training:
-
- - Japanese Wikipedia (as of 20221020, 3.2GB, 27M sentences, 1.3M documents)
- - Japanese portion of CC-100 (85GB, 619M sentences, 66M documents)
- - Japanese portion of OSCAR (54GB, 326M sentences, 25M documents)
-
- Note that we filtered out documents annotated with "header", "footer", or "noisy" tags in OSCAR.
- Also note that Japanese Wikipedia was duplicated 10 times to make the total size of the corpus comparable to that of CC-100 and OSCAR. As a result, the total size of the training data is 171GB.
-
- ## Training procedure
-
- We first segmented texts in the corpora into words using [Juman++ 2.0.0-rc3](https://github.com/ku-nlp/jumanpp/releases/tag/v2.0.0-rc3) for whole word masking.
- Then, we built a sentencepiece model with 22,012 tokens including all characters that appear in the training corpus.
-
- We tokenized raw corpora into character-level subwords using the sentencepiece model and trained the Japanese DeBERTa model using [transformers](https://github.com/huggingface/transformers) library.
- The training took 26 days using 16 NVIDIA A100-SXM4-40GB GPUs.
-
- The following hyperparameters were used during pre-training:
-
- - learning_rate: 1e-4
- - per_device_train_batch_size: 26
- - distributed_type: multi-GPU
- - num_devices: 16
- - gradient_accumulation_steps: 8
- - total_train_batch_size: 3,328
- - max_seq_length: 512
- - optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-06
- - lr_scheduler_type: linear schedule with warmup (lr = 0 at 300k steps)
- - training_steps: 260,000
- - warmup_steps: 10,000
-
- The accuracy of the trained model on the masked language modeling task was 0.795.
- The evaluation set consists of 5,000 randomly sampled documents from each of the training corpora.
-
- ## Acknowledgments
-
- This work was supported by Joint Usage/Research Center for Interdisciplinary Large-scale Information Infrastructures (JHPCN) through General Collaboration Project no. jh221004, "Developing a Platform for Constructing and Sharing of Large-Scale Japanese Language Models".
- For training models, we used the mdx: a platform for the data-driven future.
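The "How to use" block in the deleted model card leaves the inference step elided (`...`). A complete masked-prediction loop for this checkpoint could look like the sketch below; it is based on the standard transformers API, not on code from the deleted README, and loads the model from the upstream Hub repo rather than the local copy removed here.

```python
import torch
from transformers import AutoModelForMaskedLM, AutoTokenizer

# Load the same checkpoint the deleted README refers to, directly from the Hub.
tokenizer = AutoTokenizer.from_pretrained("ku-nlp/deberta-v2-large-japanese-char-wwm")
model = AutoModelForMaskedLM.from_pretrained("ku-nlp/deberta-v2-large-japanese-char-wwm")
model.eval()

sentence = "京都大学で自然言語処理を[MASK][MASK]する。"
encoding = tokenizer(sentence, return_tensors="pt")

with torch.no_grad():
    logits = model(**encoding).logits

# Take the highest-scoring token at each [MASK] position.
mask_positions = (encoding["input_ids"] == tokenizer.mask_token_id).nonzero(as_tuple=True)
predicted_ids = logits[mask_positions].argmax(dim=-1)
print(tokenizer.convert_ids_to_tokens(predicted_ids.tolist()))
```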
 
bert/deberta-v2-large-japanese-char-wwm/config.json DELETED
@@ -1,37 +0,0 @@
- {
-   "architectures": [
-     "DebertaV2ForMaskedLM"
-   ],
-   "attention_head_size": 64,
-   "attention_probs_dropout_prob": 0.1,
-   "conv_act": "gelu",
-   "conv_kernel_size": 3,
-   "hidden_act": "gelu",
-   "hidden_dropout_prob": 0.1,
-   "hidden_size": 1024,
-   "initializer_range": 0.02,
-   "intermediate_size": 4096,
-   "layer_norm_eps": 1e-07,
-   "max_position_embeddings": 512,
-   "max_relative_positions": -1,
-   "model_type": "deberta-v2",
-   "norm_rel_ebd": "layer_norm",
-   "num_attention_heads": 16,
-   "num_hidden_layers": 24,
-   "pad_token_id": 0,
-   "pooler_dropout": 0,
-   "pooler_hidden_act": "gelu",
-   "pooler_hidden_size": 1024,
-   "pos_att_type": [
-     "p2c",
-     "c2p"
-   ],
-   "position_biased_input": false,
-   "position_buckets": 256,
-   "relative_attention": true,
-   "share_att_key": true,
-   "torch_dtype": "float16",
-   "transformers_version": "4.25.1",
-   "type_vocab_size": 0,
-   "vocab_size": 22012
- }
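The deleted config.json describes a 24-layer, 1024-hidden DeBERTa-v2 architecture with a 22,012-entry character vocabulary. As a minimal sketch (assuming the file is still present at the pre-deletion path shown in the diff header), the same architecture can be rebuilt from it with transformers:

```python
from transformers import DebertaV2Config, DebertaV2ForMaskedLM

# Path assumes the layout before this commit; after the deletion the config
# only exists upstream in ku-nlp/deberta-v2-large-japanese-char-wwm.
config = DebertaV2Config.from_json_file(
    "bert/deberta-v2-large-japanese-char-wwm/config.json"
)
print(config.num_hidden_layers, config.hidden_size, config.vocab_size)  # 24 1024 22012

# Instantiate the architecture with randomly initialized weights;
# from_pretrained() would be needed to load the actual pytorch_model.bin.
model = DebertaV2ForMaskedLM(config)
print(f"{sum(p.numel() for p in model.parameters()) / 1e6:.1f}M parameters")
```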
 
bert/deberta-v2-large-japanese-char-wwm/special_tokens_map.json DELETED
@@ -1,7 +0,0 @@
- {
-   "cls_token": "[CLS]",
-   "mask_token": "[MASK]",
-   "pad_token": "[PAD]",
-   "sep_token": "[SEP]",
-   "unk_token": "[UNK]"
- }
 
bert/deberta-v2-large-japanese-char-wwm/tokenizer_config.json DELETED
@@ -1,19 +0,0 @@
- {
-   "cls_token": "[CLS]",
-   "do_lower_case": false,
-   "do_subword_tokenize": true,
-   "do_word_tokenize": true,
-   "jumanpp_kwargs": null,
-   "mask_token": "[MASK]",
-   "mecab_kwargs": null,
-   "model_max_length": 1000000000000000019884624838656,
-   "never_split": null,
-   "pad_token": "[PAD]",
-   "sep_token": "[SEP]",
-   "special_tokens_map_file": null,
-   "subword_tokenizer_type": "character",
-   "sudachi_kwargs": null,
-   "tokenizer_class": "BertJapaneseTokenizer",
-   "unk_token": "[UNK]",
-   "word_tokenizer_type": "basic"
- }
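The deleted tokenizer_config.json selects BertJapaneseTokenizer with a basic word tokenizer and character-level subwords, which is what makes this a character-level ("char-wwm") model. A small sketch of the resulting behavior, loading the tokenizer from the Hub repo rather than the local copy removed by this commit:

```python
from transformers import AutoTokenizer

# The local bert/deberta-v2-large-japanese-char-wwm/ copy is removed here,
# so load the equivalent tokenizer from the upstream repository instead.
tokenizer = AutoTokenizer.from_pretrained("ku-nlp/deberta-v2-large-japanese-char-wwm")

# subword_tokenizer_type "character" splits each word into single characters.
print(tokenizer.tokenize("自然言語処理"))
# Expected output: ['自', '然', '言', '語', '処', '理']
```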
 
bert/deberta-v2-large-japanese-char-wwm/vocab.txt DELETED
The diff for this file is too large to render. See raw diff