First update
- .gitattributes +1 -0
- README.md +214 -1
- config.json +3 -0
- optimizer.pt +3 -0
- pytorch_model.bin +3 -0
- rng_state.pth +3 -0
- scheduler.pt +3 -0
- sentencepiece.bpe.model +3 -0
- special_tokens_map.json +3 -0
- tokenizer.json +3 -0
- tokenizer_config.json +3 -0
- trainer_state.json +3 -0
- training_args.bin +3 -0
    	
        .gitattributes
    CHANGED
    
@@ -25,3 +25,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zstandard filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+*.json filter=lfs diff=lfs merge=lfs -text
    	
        README.md
    CHANGED
    
@@ -1,3 +1,216 @@
 ---
-
+tags:
+- Transformers
+- text-classification
+- multi-class-classification
+languages:
+- af-ZA
+- am-ET
+- ar-SA
+- az-AZ
+- bn-BD
+- cy-GB
+- da-DK
+- de-DE
+- el-GR
+- en-US
+- es-ES
+- fa-IR
+- fi-FI
+- fr-FR
+- he-IL
+- hi-IN
+- hu-HU
+- hy-AM
+- id-ID
+- is-IS
+- it-IT
+- ja-JP
+- jv-ID
+- ka-GE
+- km-KH
+- kn-IN
+- ko-KR
+- lv-LV
+- ml-IN
+- mn-MN
+- ms-MY
+- my-MM
+- nb-NO
+- nl-NL
+- pl-PL
+- pt-PT
+- ro-RO
+- ru-RU
+- sl-SL
+- sq-AL
+- sv-SE
+- sw-KE
+- ta-IN
+- te-IN
+- th-TH
+- tl-PH
+- tr-TR
+- ur-PK
+- vi-VN
+- zh-CN
+- zh-TW
+multilinguality:
+- af-ZA
+- am-ET
+- ar-SA
+- az-AZ
+- bn-BD
+- cy-GB
+- da-DK
+- de-DE
+- el-GR
+- en-US
+- es-ES
+- fa-IR
+- fi-FI
+- fr-FR
+- he-IL
+- hi-IN
+- hu-HU
+- hy-AM
+- id-ID
+- is-IS
+- it-IT
+- ja-JP
+- jv-ID
+- ka-GE
+- km-KH
+- kn-IN
+- ko-KR
+- lv-LV
+- ml-IN
+- mn-MN
+- ms-MY
+- my-MM
+- nb-NO
+- nl-NL
+- pl-PL
+- pt-PT
+- ro-RO
+- ru-RU
+- sl-SL
+- sq-AL
+- sv-SE
+- sw-KE
+- ta-IN
+- te-IN
+- th-TH
+- tl-PH
+- tr-TR
+- ur-PK
+- vi-VN
+- zh-CN
+- zh-TW
+datasets:
+- qanastek/MASSIVE
+widget:
+- text: "wake me up at five am this week"
+- text: "je veux écouter la chanson de jacques brel encore une fois"
+- text: "quiero escuchar la canción de arijit singh una vez más"
+- text: "olly onde é que á um parque por perto onde eu possa correr"
+- text: "פרק הבא בפודקאסט בבקשה"
+- text: "亚马逊股价"
+- text: "найди билет на поезд в санкт-петербург"
+license: cc-by-4.0
 ---
+
+**People Involved**
+
+* [LABRAK Yanis](https://www.linkedin.com/in/yanis-labrak-8a7412145/) (1)
+
+**Affiliations**
+
+1. [LIA, NLP team](https://lia.univ-avignon.fr/), Avignon University, Avignon, France.
+
+## Demo: How to use in HuggingFace Transformers Pipeline
+
+Requires [transformers](https://pypi.org/project/transformers/): ```pip install transformers```
+
+```python
+from transformers import AutoTokenizer, AutoModelForSequenceClassification, TextClassificationPipeline
+model_name = 'qanastek/51-languages-classifier'
+tokenizer = AutoTokenizer.from_pretrained(model_name)
+model = AutoModelForSequenceClassification.from_pretrained(model_name)
+classifier = TextClassificationPipeline(model=model, tokenizer=tokenizer)
+res = classifier("פרק הבא בפודקאסט בבקשה")
+print(res)
+```
+
+Outputs:
+
+```python
+[{'label': 'fr-FR', 'score': 0.9998375177383423}]
+```
+
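For quick experiments, the same checkpoint can also be loaded through the generic `pipeline` factory instead of constructing `TextClassificationPipeline` by hand; a minimal sketch (the input sentence and the expected label are illustrative, taken from the widget examples above):

```python
from transformers import pipeline

# Minimal sketch: let the pipeline factory resolve the tokenizer and model from the Hub id.
classifier = pipeline("text-classification", model="qanastek/51-languages-classifier")

# Illustrative Russian utterance from the widget list; the model should return a ru-RU label.
print(classifier("найди билет на поезд в санкт-петербург"))
```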
+## Training data
+
+[MASSIVE](https://huggingface.co/datasets/qanastek/MASSIVE) is a parallel dataset of > 1M utterances across 51 languages with annotations for the Natural Language Understanding tasks of intent prediction and slot annotation. Utterances span 60 intents and include 55 slot types. MASSIVE was created by localizing the SLURP dataset, composed of general Intelligent Voice Assistant single-shot interactions.
+
+
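For reference, the training corpus cited above can be pulled with the `datasets` library; a minimal sketch, assuming the dataset exposes one configuration per locale code listed in the front matter (e.g. `fr-FR`):

```python
from datasets import load_dataset

# Assumption: qanastek/MASSIVE provides per-locale configurations named after the locale codes.
massive_fr = load_dataset("qanastek/MASSIVE", "fr-FR", split="train")

# Each record carries the utterance together with its intent and slot annotations.
print(massive_fr[0])
```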
+## Evaluation results
+
+```plain
+              precision    recall  f1-score   support
+
+       af-ZA     0.9821    0.9805    0.9813      2974
+       am-ET     1.0000    1.0000    1.0000      2974
+       ar-SA     0.9809    0.9822    0.9815      2974
+       az-AZ     0.9946    0.9845    0.9895      2974
+       bn-BD     0.9997    0.9990    0.9993      2974
+       cy-GB     0.9970    0.9929    0.9949      2974
+       da-DK     0.9575    0.9617    0.9596      2974
+       de-DE     0.9906    0.9909    0.9908      2974
+       el-GR     0.9997    0.9973    0.9985      2974
+       en-US     0.9712    0.9866    0.9788      2974
+       es-ES     0.9825    0.9842    0.9834      2974
+       fa-IR     0.9940    0.9973    0.9956      2974
+       fi-FI     0.9943    0.9946    0.9945      2974
+       fr-FR     0.9963    0.9923    0.9943      2974
+       he-IL     1.0000    0.9997    0.9998      2974
+       hi-IN     1.0000    0.9980    0.9990      2974
+       hu-HU     0.9983    0.9950    0.9966      2974
+       hy-AM     1.0000    0.9993    0.9997      2974
+       id-ID     0.9319    0.9291    0.9305      2974
+       is-IS     0.9966    0.9943    0.9955      2974
+       it-IT     0.9698    0.9926    0.9811      2974
+       ja-JP     0.9987    0.9963    0.9975      2974
+       jv-ID     0.9628    0.9744    0.9686      2974
+       ka-GE     0.9993    0.9997    0.9995      2974
+       km-KH     0.9867    0.9963    0.9915      2974
+       kn-IN     1.0000    0.9993    0.9997      2974
+       ko-KR     0.9917    0.9997    0.9956      2974
+       lv-LV     0.9990    0.9950    0.9970      2974
+       ml-IN     0.9997    0.9997    0.9997      2974
+       mn-MN     0.9987    0.9966    0.9976      2974
+       ms-MY     0.9359    0.9418    0.9388      2974
+       my-MM     1.0000    0.9993    0.9997      2974
+       nb-NO     0.9600    0.9533    0.9566      2974
+       nl-NL     0.9850    0.9748    0.9799      2974
+       pl-PL     0.9946    0.9923    0.9934      2974
+       pt-PT     0.9885    0.9798    0.9841      2974
+       ro-RO     0.9919    0.9916    0.9918      2974
+       ru-RU     0.9976    0.9983    0.9980      2974
+       sl-SL     0.9956    0.9939    0.9948      2974
+       sq-AL     0.9936    0.9896    0.9916      2974
+       sv-SE     0.9902    0.9842    0.9872      2974
+       sw-KE     0.9867    0.9953    0.9910      2974
+       ta-IN     1.0000    1.0000    1.0000      2974
+       te-IN     1.0000    0.9997    0.9998      2974
+       th-TH     1.0000    0.9983    0.9992      2974
+       tl-PH     0.9929    0.9899    0.9914      2974
+       tr-TR     0.9869    0.9872    0.9871      2974
+       ur-PK     0.9983    0.9929    0.9956      2974
+       vi-VN     0.9993    0.9973    0.9983      2974
+       zh-CN     0.9812    0.9832    0.9822      2974
+       zh-TW     0.9832    0.9815    0.9823      2974
+
+    accuracy                         0.9889    151674
+   macro avg     0.9889    0.9889    0.9889    151674
+weighted avg     0.9889    0.9889    0.9889    151674
+```
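The table above follows the layout of scikit-learn's `classification_report`; a small sketch of how such a report is typically produced (the label lists here are toy data, not the actual evaluation split):

```python
from sklearn.metrics import classification_report

# Toy gold vs. predicted locale labels, standing in for the real test-set predictions.
y_true = ["fr-FR", "fr-FR", "es-ES", "ru-RU", "he-IL"]
y_pred = ["fr-FR", "es-ES", "es-ES", "ru-RU", "he-IL"]

# digits=4 matches the four-decimal precision shown in the report above.
print(classification_report(y_true, y_pred, digits=4))
```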
    	
        config.json
    ADDED
    
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:8b5e717ed1222ea2d1da259d79d0f844cbc139a1e5ba25387bc8c2c640b20668
+size 2912
    	
        optimizer.pt
    ADDED
    
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:14503cad0cf8ef115210c825e30ac3aa6ef6b6af57f8b14b0df2c13fe9e8b270
+size 2224779869
    	
        pytorch_model.bin
    ADDED
    
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:802a39a0e52e2ed6e1b90ce6ad7552f77c4e80370246c0e2d9473329ea256263
+size 1112403117
    	
        rng_state.pth
    ADDED
    
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:027717d8103c9237aa4ea05699c9898f259c407c504aacd1fd633b59abaf9ea8
+size 15523
    	
        scheduler.pt
    ADDED
    
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c713a6c8e565ba2920aeb9096491484c3dd4706e7096d1811dfb74dfceb2a4e2
+size 623
    	
        sentencepiece.bpe.model
    ADDED
    
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:cfc8146abe2a0488e9e2a0c56de7952f7c11ab059eca145a0a727afce0db2865
+size 5069051
    	
        special_tokens_map.json
    ADDED
    
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:378eb3bf733eb16e65792d7e3fda5b8a4631387ca04d2015199c4d4f22ae554d
+size 239
    	
        tokenizer.json
    ADDED
    
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f2c509a525eb51aebb33fb59c24ee923c1d4c1db23c3ae81fe05ccf354084f7b
+size 17082758
    	
        tokenizer_config.json
    ADDED
    
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:2a29b4422dd3e2b3311e0b5026f27f884f4c0b0ca566e8cd598025cf873f493d
+size 398
    	
        trainer_state.json
    ADDED
    
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e4c42750649f65dbeebf47c794bfc8b6cb0896d2753e51afb069b4bb84a40226
+size 31133
    	
        training_args.bin
    ADDED
    
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:51739794a631065dc342715b23b38cd583a2b306e00a2bb1ceab70162b38ba58
+size 3055
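All of the files added above are Git LFS pointer stubs (a version line, a sha256 oid, and the payload size in bytes); the actual binaries live in LFS storage. A minimal sketch for fetching one of them programmatically, assuming the repository id matches the model name used in the README (`qanastek/51-languages-classifier`):

```python
from huggingface_hub import hf_hub_download

# Resolves the LFS pointer and downloads the real file contents to the local cache.
local_path = hf_hub_download(
    repo_id="qanastek/51-languages-classifier",
    filename="sentencepiece.bpe.model",
)
print(local_path)
```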
