| """ | |
| Load embedding models from huggingface. | |
| """ | |
| import torch | |
| from langchain.embeddings import HuggingFaceEmbeddings | |
def get_hf_embeddings(model_name=None):
    """Build a HuggingFaceEmbeddings instance for the requested model.

    Args:
        model_name: Hugging Face model identifier. When ``None``, the
            default ``"BAAI/bge-large-en-v1.5"`` is used.

    Returns:
        A ``HuggingFaceEmbeddings`` object constructed with the resolved
        model name.
    """
    # Other candidates that were considered:
    #   "BAAI/bge-m3" (good, though large and slow)
    #   "BAAI/bge-base-en-v1.5" -> also good
    #   "sentence-transformers/all-mpnet-base-v2"
    #   "maidalun1020/bce-embedding-base_v1"
    #   "intfloat/multilingual-e5-large"
    # Ref: https://huggingface.co/spaces/mteb/leaderboard
    #      https://huggingface.co/maidalun1020/bce-embedding-base_v1
    resolved_name = (
        model_name if model_name is not None else "BAAI/bge-large-en-v1.5"
    )
    return HuggingFaceEmbeddings(model_name=resolved_name)
def get_jinaai_embeddings(
    model_name="jinaai/jina-embeddings-v2-base-en", device="auto"
):
    """Build a HuggingFaceEmbeddings instance for a Jina AI embedding model.

    Args:
        model_name: Hugging Face model identifier of the Jina AI model.
        device: ``"cpu"``, ``"cuda"``, or ``"auto"``. With ``"auto"`` the
            device is ``"cuda"`` when ``torch.cuda.is_available()`` returns
            true and ``"cpu"`` otherwise. Any other value is passed through
            unchanged.

    Returns:
        A ``HuggingFaceEmbeddings`` object constructed with ``model_name``
        and ``model_kwargs`` carrying the device and
        ``trust_remote_code=True``.
    """
    if device == "auto":
        device = "cuda" if torch.cuda.is_available() else "cpu"

    # trust_remote_code is enabled for Jina AI models; see
    # https://github.com/langchain-ai/langchain/issues/6080
    # The flag travels via model_kwargs, so there is no separate
    # transformers.AutoModel preload here: such a model would be loaded
    # in full and then thrown away without being used by the embedding.
    model_kwargs = {"device": device, "trust_remote_code": True}
    return HuggingFaceEmbeddings(
        model_name=model_name,
        model_kwargs=model_kwargs,
    )