Tom Aarsen
committed on
Commit
·
a9de66e
1
Parent(s):
a62d6ce
embeddings models -> embedding models
Browse files
README.md
CHANGED
|
@@ -42,23 +42,23 @@ from sentence_transformers import SentenceTransformer, util
|
|
| 42 |
query = "How many people live in London?"
|
| 43 |
docs = ["Around 9 Million people live in London", "London is known for its financial district"]
|
| 44 |
|
| 45 |
-
#Load the model
|
| 46 |
model = SentenceTransformer('sentence-transformers/multi-qa-mpnet-base-dot-v1')
|
| 47 |
|
| 48 |
-
#Encode query and documents
|
| 49 |
query_emb = model.encode(query)
|
| 50 |
doc_emb = model.encode(docs)
|
| 51 |
|
| 52 |
-
#Compute dot score between query and all document embeddings
|
| 53 |
scores = util.dot_score(query_emb, doc_emb)[0].cpu().tolist()
|
| 54 |
|
| 55 |
-
#Combine docs & scores
|
| 56 |
doc_score_pairs = list(zip(docs, scores))
|
| 57 |
|
| 58 |
-
#Sort by decreasing score
|
| 59 |
doc_score_pairs = sorted(doc_score_pairs, key=lambda x: x[1], reverse=True)
|
| 60 |
|
| 61 |
-
#Output passages & scores
|
| 62 |
for doc, score in doc_score_pairs:
|
| 63 |
print(score, doc)
|
| 64 |
```
|
|
@@ -71,11 +71,11 @@ Without [sentence-transformers](https://www.SBERT.net), you can use the model li
|
|
| 71 |
from transformers import AutoTokenizer, AutoModel
|
| 72 |
import torch
|
| 73 |
|
| 74 |
-
#CLS Pooling - Take output from first token
|
| 75 |
def cls_pooling(model_output):
|
| 76 |
return model_output.last_hidden_state[:,0]
|
| 77 |
|
| 78 |
-
#Encode text
|
| 79 |
def encode(texts):
|
| 80 |
# Tokenize sentences
|
| 81 |
encoded_input = tokenizer(texts, padding=True, truncation=True, return_tensors='pt')
|
|
@@ -98,27 +98,27 @@ docs = ["Around 9 Million people live in London", "London is known for its finan
|
|
| 98 |
tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/multi-qa-mpnet-base-dot-v1")
|
| 99 |
model = AutoModel.from_pretrained("sentence-transformers/multi-qa-mpnet-base-dot-v1")
|
| 100 |
|
| 101 |
-
#Encode query and docs
|
| 102 |
query_emb = encode(query)
|
| 103 |
doc_emb = encode(docs)
|
| 104 |
|
| 105 |
-
#Compute dot score between query and all document embeddings
|
| 106 |
scores = torch.mm(query_emb, doc_emb.transpose(0, 1))[0].cpu().tolist()
|
| 107 |
|
| 108 |
-
#Combine docs & scores
|
| 109 |
doc_score_pairs = list(zip(docs, scores))
|
| 110 |
|
| 111 |
-
#Sort by decreasing score
|
| 112 |
doc_score_pairs = sorted(doc_score_pairs, key=lambda x: x[1], reverse=True)
|
| 113 |
|
| 114 |
-
#Output passages & scores
|
| 115 |
for doc, score in doc_score_pairs:
|
| 116 |
print(score, doc)
|
| 117 |
```
|
| 118 |
|
| 119 |
## Usage (Text Embeddings Inference (TEI))
|
| 120 |
|
| 121 |
-
[Text Embeddings Inference (TEI)](https://github.com/huggingface/text-embeddings-inference) is a blazing fast inference solution for text embeddings models.
|
| 122 |
|
| 123 |
- CPU:
|
| 124 |
```bash
|
|
|
|
| 42 |
query = "How many people live in London?"
|
| 43 |
docs = ["Around 9 Million people live in London", "London is known for its financial district"]
|
| 44 |
|
| 45 |
+
# Load the model
|
| 46 |
model = SentenceTransformer('sentence-transformers/multi-qa-mpnet-base-dot-v1')
|
| 47 |
|
| 48 |
+
# Encode query and documents
|
| 49 |
query_emb = model.encode(query)
|
| 50 |
doc_emb = model.encode(docs)
|
| 51 |
|
| 52 |
+
# Compute dot score between query and all document embeddings
|
| 53 |
scores = util.dot_score(query_emb, doc_emb)[0].cpu().tolist()
|
| 54 |
|
| 55 |
+
# Combine docs & scores
|
| 56 |
doc_score_pairs = list(zip(docs, scores))
|
| 57 |
|
| 58 |
+
# Sort by decreasing score
|
| 59 |
doc_score_pairs = sorted(doc_score_pairs, key=lambda x: x[1], reverse=True)
|
| 60 |
|
| 61 |
+
# Output passages & scores
|
| 62 |
for doc, score in doc_score_pairs:
|
| 63 |
print(score, doc)
|
| 64 |
```
|
|
|
|
| 71 |
from transformers import AutoTokenizer, AutoModel
|
| 72 |
import torch
|
| 73 |
|
| 74 |
+
# CLS Pooling - Take output from first token
|
| 75 |
def cls_pooling(model_output):
|
| 76 |
return model_output.last_hidden_state[:,0]
|
| 77 |
|
| 78 |
+
# Encode text
|
| 79 |
def encode(texts):
|
| 80 |
# Tokenize sentences
|
| 81 |
encoded_input = tokenizer(texts, padding=True, truncation=True, return_tensors='pt')
|
|
|
|
| 98 |
tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/multi-qa-mpnet-base-dot-v1")
|
| 99 |
model = AutoModel.from_pretrained("sentence-transformers/multi-qa-mpnet-base-dot-v1")
|
| 100 |
|
| 101 |
+
# Encode query and docs
|
| 102 |
query_emb = encode(query)
|
| 103 |
doc_emb = encode(docs)
|
| 104 |
|
| 105 |
+
# Compute dot score between query and all document embeddings
|
| 106 |
scores = torch.mm(query_emb, doc_emb.transpose(0, 1))[0].cpu().tolist()
|
| 107 |
|
| 108 |
+
# Combine docs & scores
|
| 109 |
doc_score_pairs = list(zip(docs, scores))
|
| 110 |
|
| 111 |
+
# Sort by decreasing score
|
| 112 |
doc_score_pairs = sorted(doc_score_pairs, key=lambda x: x[1], reverse=True)
|
| 113 |
|
| 114 |
+
# Output passages & scores
|
| 115 |
for doc, score in doc_score_pairs:
|
| 116 |
print(score, doc)
|
| 117 |
```
|
| 118 |
|
| 119 |
## Usage (Text Embeddings Inference (TEI))
|
| 120 |
|
| 121 |
+
[Text Embeddings Inference (TEI)](https://github.com/huggingface/text-embeddings-inference) is a blazing fast inference solution for text embedding models.
|
| 122 |
|
| 123 |
- CPU:
|
| 124 |
```bash
|