Update README.md
README.md CHANGED
@@ -60,11 +60,12 @@ from transformers import AutoModelForMaskedLM, AutoTokenizer
 
 
 # get sparse vector from dense vectors with shape batch_size * seq_len * vocab_size
-def get_sparse_vector(feature, output):
+def get_sparse_vector(feature, output, prune_ratio=0.1):
     values, _ = torch.max(output*feature["attention_mask"].unsqueeze(-1), dim=1)
     values = torch.log(1 + torch.relu(values))
     values[:,special_token_ids] = 0
-
+    max_values = values.max(dim=-1)[0].unsqueeze(1) * prune_ratio
+    return values * (values > max_values)
 
 # transform the sparse vector to a dict of (token, weight)
 def transform_sparse_vector_to_dict(sparse_vector):
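For intuition, here is a standalone toy run of the pruning step this commit adds (the numbers are made up, not model output): any activation below `prune_ratio` times the row's maximum is zeroed, which keeps the resulting sparse vectors compact.

```python
import torch

# hypothetical activations for one sequence over a 5-token vocab
values = torch.tensor([[2.0, 0.15, 0.0, 1.2, 0.05]])

prune_ratio = 0.1
# per-row threshold: 10% of the row's max activation (2.0 * 0.1 = 0.2)
max_values = values.max(dim=-1)[0].unsqueeze(1) * prune_ratio
pruned = values * (values > max_values)

print(pruned)  # tensor([[2.0000, 0.0000, 0.0000, 1.2000, 0.0000]])
```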
@@ -127,7 +128,7 @@ document_sparse_vector = get_sparse_vector(feature_document, output)
 
 # get similarity score
 sim_score = torch.matmul(query_sparse_vector[0],document_sparse_vector[0])
-print(sim_score) # tensor(7.
+print(sim_score) # tensor(7.6317, grad_fn=<DotBackward0>)
 
 
 query_token_weight = transform_sparse_vector_to_dict(query_sparse_vector)[0]
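The diff only shows the signature of `transform_sparse_vector_to_dict`, so here is a minimal sketch of what such a helper could look like, assuming `torch` and the `tokenizer` loaded via the AutoTokenizer import visible in the first hunk; the README's real implementation may differ.

```python
# sketch only: map each nonzero vocab dimension of a sparse vector back
# to its token string, one dict of (token, weight) per input row
def transform_sparse_vector_to_dict(sparse_vector):
    all_token_weights = []
    for vec in sparse_vector:
        ids = torch.nonzero(vec, as_tuple=True)[0].tolist()
        all_token_weights.append(
            {tokenizer.convert_ids_to_tokens(i): float(vec[i]) for i in ids}
        )
    return all_token_weights
```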
@@ -143,7 +144,6 @@ for token in sorted(query_token_weight, key=lambda x:query_token_weight[x], reve
 # score in query: 1.6406, score in document: 0.9018, token: now
 # score in query: 1.6108, score in document: 0.3141, token: ?
 # score in query: 1.2721, score in document: 1.3446, token: ny
-# score in query: 0.6005, score in document: 0.1804, token: in
 ```
 
 The above code sample shows an example of neural sparse search: although there is no token overlap between the original query and the document, the model still finds a good match.
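Since the similarity is a plain dot product over the vocabulary dimension, it decomposes into per-token contributions: a token's weight in the query times its weight in the document. A toy recomputation using the weights from the printout above shows these three tokens account for roughly half the score; the remaining expansion tokens make up the rest of the 7.6317 total.

```python
# weights copied from the printed output above; tokens not shown there
# contribute the remainder of the 7.6317 total
pairs = {
    "now": (1.6406, 0.9018),
    "?":   (1.6108, 0.3141),
    "ny":  (1.2721, 1.3446),
}
partial = sum(q * d for q, d in pairs.values())
print(round(partial, 4))  # 3.6959 of the 7.6317 total
```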