nreimers committed · Commit 08ec750
Parent(s): c008614

upload
- README.md +78 -0
- config.json +23 -0
- pytorch_model.bin +3 -0
- sentence_bert_config.json +3 -0
- sentencepiece.bpe.model +0 -0
- special_tokens_map.json +1 -0
- tokenizer_config.json +1 -0
    	
README.md ADDED
@@ -0,0 +1,78 @@
# Sentence Embeddings Models trained on Paraphrases
This model is from the [sentence-transformers](https://github.com/UKPLab/sentence-transformers) repository. It was trained on SNLI + MultiNLI and on the STS benchmark dataset. Further details on SBERT can be found in the paper: [Sentence-BERT: Sentence Embeddings using Siamese BERT-Networks](https://arxiv.org/abs/1908.10084)

This model is a multilingual version; it was trained on parallel data for 50+ languages.

For more details, see: [SBERT.net - Pretrained Models](https://www.sbert.net/docs/pretrained_models.html)

## Usage (HuggingFace Models Repository)

You can use the model directly from the model repository to compute sentence embeddings:
```python
from transformers import AutoTokenizer, AutoModel
import torch


# Mean pooling - take the attention mask into account for correct averaging
def mean_pooling(model_output, attention_mask):
    token_embeddings = model_output[0]  # first element of model_output contains all token embeddings
    input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
    sum_embeddings = torch.sum(token_embeddings * input_mask_expanded, 1)
    sum_mask = torch.clamp(input_mask_expanded.sum(1), min=1e-9)
    return sum_embeddings / sum_mask


# Sentences we want sentence embeddings for
sentences = ['This framework generates embeddings for each input sentence',
             'Sentences are passed as a list of strings.',
             'The quick brown fox jumps over the lazy dog.']

# Load AutoModel from the Hugging Face model repository
tokenizer = AutoTokenizer.from_pretrained("model_name")
model = AutoModel.from_pretrained("model_name")

# Tokenize sentences
encoded_input = tokenizer(sentences, padding=True, truncation=True, max_length=128, return_tensors='pt')

# Compute token embeddings
with torch.no_grad():
    model_output = model(**encoded_input)

# Perform pooling. In this case, mean pooling
sentence_embeddings = mean_pooling(model_output, encoded_input['attention_mask'])
```
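The pooled vectors can be compared directly. As a minimal sketch continuing the snippet above (plain PyTorch only; this similarity step is not part of the original card):
```python
import torch.nn.functional as F

# L2-normalize so that dot products equal cosine similarities
normalized = F.normalize(sentence_embeddings, p=2, dim=1)

# Similarity of the first sentence to the other two;
# values close to 1.0 indicate semantically similar sentences
scores = normalized[0] @ normalized[1:].T
print(scores)
```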
## Usage (Sentence-Transformers)
Using this model becomes more convenient when you have [sentence-transformers](https://github.com/UKPLab/sentence-transformers) installed:
```
pip install -U sentence-transformers
```

Then you can use the model like this:
```python
from sentence_transformers import SentenceTransformer

model = SentenceTransformer('model_name')
sentences = ['This framework generates embeddings for each input sentence',
             'Sentences are passed as a list of strings.',
             'The quick brown fox jumps over the lazy dog.']
sentence_embeddings = model.encode(sentences)

print("Sentence embeddings:")
print(sentence_embeddings)
```
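Similarities can also be computed from these embeddings; a minimal sketch, assuming a sentence-transformers version that ships the `util.pytorch_cos_sim` helper:
```python
from sentence_transformers import util

# Pairwise cosine similarities between all sentences encoded above
# (the helper accepts the numpy array returned by model.encode)
cos_scores = util.pytorch_cos_sim(sentence_embeddings, sentence_embeddings)
print(cos_scores)  # 3x3 matrix; diagonal entries are ~1.0
```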
## Citing & Authors
If you find this model helpful, feel free to cite our publication [Making Monolingual Sentence Embeddings Multilingual using Knowledge Distillation](https://arxiv.org/abs/2004.09813):
```
@inproceedings{reimers-2020-multilingual-sentence-bert,
    title = "Making Monolingual Sentence Embeddings Multilingual using Knowledge Distillation",
    author = "Reimers, Nils and Gurevych, Iryna",
    booktitle = "Proceedings of the 2020 Conference on Empirical Methods in Natural Language Processing",
    month = "11",
    year = "2020",
    publisher = "Association for Computational Linguistics",
    url = "https://arxiv.org/abs/2004.09813",
}
```
    	
config.json ADDED
@@ -0,0 +1,23 @@
{
  "architectures": [
    "XLMRobertaModel"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "eos_token_id": 2,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "xlm-roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_past": true,
  "pad_token_id": 1,
  "type_vocab_size": 1,
  "vocab_size": 250002
}
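This config describes a standard 12-layer XLM-RoBERTa base encoder with 768-dimensional hidden states. A minimal sketch to inspect it with transformers ("model_name" is a placeholder, as in the README above):
```python
from transformers import AutoConfig

# "model_name" is a placeholder repository id
config = AutoConfig.from_pretrained("model_name")
print(config.model_type)         # xlm-roberta
print(config.hidden_size)        # 768 - also the sentence embedding dimension after mean pooling
print(config.num_hidden_layers)  # 12
```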
    	
pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:48315809d75adfbf8e9922ee0cdaaae26b4f6680ba8595d7ae50d67de848c830
size 1112256686
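This is a Git LFS pointer file (spec v1), not the weights themselves: `oid` is the SHA-256 of the actual ~1.1 GB `pytorch_model.bin`, which LFS fetches on checkout. As a sketch of one way to download just this file, assuming the `huggingface_hub` package is installed (the repo id is a placeholder, as above):
```python
from huggingface_hub import hf_hub_download

# Resolve the LFS pointer and download the actual weights file
# ("model_name" is a placeholder repository id)
path = hf_hub_download(repo_id="model_name", filename="pytorch_model.bin")
print(path)  # local path to the downloaded weights
```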
    	
sentence_bert_config.json ADDED
@@ -0,0 +1,3 @@
{
  "max_seq_length": 128
}
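sentence-transformers reads this file when loading the model and truncates inputs to 128 tokens; a minimal sketch to verify (assuming the library exposes the `max_seq_length` attribute, and with a placeholder model id):
```python
from sentence_transformers import SentenceTransformer

model = SentenceTransformer('model_name')  # placeholder model id
print(model.max_seq_length)  # 128, taken from sentence_bert_config.json
```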
    	
sentencepiece.bpe.model ADDED
Binary file (5.07 MB)
    	
special_tokens_map.json ADDED
@@ -0,0 +1 @@
{"bos_token": "<s>", "eos_token": "</s>", "unk_token": "<unk>", "sep_token": "</s>", "pad_token": "<pad>", "cls_token": "<s>", "mask_token": "<mask>"}
    	
tokenizer_config.json ADDED
@@ -0,0 +1 @@
{"model_max_length": 512, "special_tokens_map_file": "output/xlm-r-nli-stsb-40langs/0_Transformer/special_tokens_map.json", "full_tokenizer_file": null}
