import os
from typing import List, Tuple

import streamlit as st
import torch
from transformers import AutoTokenizer, AutoModelForMaskedLM

from llama_index.core import Settings, VectorStoreIndex
from llama_index.core.memory import ChatMemoryBuffer
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.llms.huggingface import HuggingFaceInferenceAPI
from llama_index.vector_stores.qdrant import QdrantVectorStore
from qdrant_client import QdrantClient

# Credentials and the Qdrant endpoint are read from environment variables.
HUGGINGFACEHUB_API_TOKEN = os.environ.get("HUGGINGFACEHUB_API_TOKEN")
Q_END_POINT = os.environ.get("Q_END_POINT")
Q_API_KEY = os.environ.get("Q_API_KEY")
# SPLADE sparse encoders for hybrid retrieval, following the LlamaIndex example:
# https://docs.llamaindex.ai/en/stable/examples/vector_stores/qdrant_hybrid.html
# One encoder is fine-tuned for documents, the other for queries.
doc_tokenizer = AutoTokenizer.from_pretrained(
    "naver/efficient-splade-VI-BT-large-doc"
)
doc_model = AutoModelForMaskedLM.from_pretrained(
    "naver/efficient-splade-VI-BT-large-doc"
)
query_tokenizer = AutoTokenizer.from_pretrained(
    "naver/efficient-splade-VI-BT-large-query"
)
query_model = AutoModelForMaskedLM.from_pretrained(
    "naver/efficient-splade-VI-BT-large-query"
)

device = "cuda:0" if torch.cuda.is_available() else "cpu"
doc_model = doc_model.to(device)
query_model = query_model.to(device)
def sparse_doc_vectors(
    texts: List[str],
) -> Tuple[List[List[int]], List[List[float]]]:
    """
    Compute SPLADE document vectors: apply ReLU and log to the MLM logits,
    mask out padding, then max-pool over the sequence dimension.
    """
    tokens = doc_tokenizer(
        texts, truncation=True, padding=True, return_tensors="pt"
    )
    # move inputs to the same device as the model
    tokens = tokens.to(device)
    output = doc_model(**tokens)
    logits, attention_mask = output.logits, tokens.attention_mask
    relu_log = torch.log(1 + torch.relu(logits))
    weighted_log = relu_log * attention_mask.unsqueeze(-1)
    tvecs, _ = torch.max(weighted_log, dim=1)
    # extract the non-zero weights and their vocabulary indices
    indices = []
    vecs = []
    for batch in tvecs:
        indices.append(batch.nonzero(as_tuple=True)[0].tolist())
        vecs.append(batch[indices[-1]].tolist())
    return indices, vecs
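
# Example (illustrative): each input text yields parallel lists of vocabulary
# indices and weights of equal length, the format Qdrant expects for sparse vectors:
#     idx, weights = sparse_doc_vectors(["A short sample passage."])
#     assert len(idx[0]) == len(weights[0])
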
def sparse_query_vectors(
    texts: List[str],
) -> Tuple[List[List[int]], List[List[float]]]:
    """
    Compute SPLADE query vectors: apply ReLU and log to the MLM logits,
    mask out padding, then max-pool over the sequence dimension.
    """
    # TODO: compute sparse vectors in batches if max length is exceeded
    tokens = query_tokenizer(
        texts, truncation=True, padding=True, return_tensors="pt"
    )
    # move inputs to the same device as the model
    tokens = tokens.to(device)
    output = query_model(**tokens)
    logits, attention_mask = output.logits, tokens.attention_mask
    relu_log = torch.log(1 + torch.relu(logits))
    weighted_log = relu_log * attention_mask.unsqueeze(-1)
    tvecs, _ = torch.max(weighted_log, dim=1)
    # extract the non-zero weights and their vocabulary indices
    indices = []
    vecs = []
    for batch in tvecs:
        indices.append(batch.nonzero(as_tuple=True)[0].tolist())
        vecs.append(batch[indices[-1]].tolist())
    return indices, vecs
st.header("Chat with the Bible docs 💬 📚")

if "messages" not in st.session_state.keys():  # Initialize the chat message history
    st.session_state.messages = [
        {"role": "assistant", "content": "Ask me a question about the Bible!"}
    ]
# connect to the remote Qdrant instance that holds the persistent index
client = QdrantClient(
    Q_END_POINT,
    api_key=Q_API_KEY,
)

# create our vector store with hybrid indexing enabled;
# batch_size controls how many nodes are encoded with sparse vectors at once
vector_store = QdrantVectorStore(
    "bible",
    client=client,
    enable_hybrid=True,
    batch_size=20,
    force_disable_check_same_thread=True,
    sparse_doc_fn=sparse_doc_vectors,
    sparse_query_fn=sparse_query_vectors,
)
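
# Note: the "bible" collection is expected to already exist in Qdrant and to have
# been populated elsewhere with both dense and sparse vectors; this app only queries it.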
# LLM served through the Hugging Face Inference API
llm = HuggingFaceInferenceAPI(
    model_name="meta-llama/Meta-Llama-3-8B-Instruct",
    token=HUGGINGFACEHUB_API_TOKEN,
    context_window=8192,  # Llama 3 context window
)
Settings.llm = llm
Settings.tokenizer = AutoTokenizer.from_pretrained(
    "meta-llama/Meta-Llama-3-8B-Instruct"
)

# dense embeddings for the dense side of hybrid retrieval
embed_model = HuggingFaceEmbedding(model_name="BAAI/bge-base-en-v1.5", device="cpu")
Settings.embed_model = embed_model
index = VectorStoreIndex.from_vector_store(
    vector_store=vector_store, embed_model=embed_model
)

memory = ChatMemoryBuffer.from_defaults(token_limit=1500)
chat_engine = index.as_chat_engine(
    chat_mode="condense_question",
    verbose=True,
    memory=memory,
    sparse_top_k=10,
    vector_store_query_mode="hybrid",
    similarity_top_k=3,
)
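
# With vector_store_query_mode="hybrid", the condense_question engine first rewrites
# the chat history into a standalone question, retrieves with both the dense
# bge-base-en-v1.5 embeddings (similarity_top_k=3) and the SPLADE query encoder
# (sparse_top_k=10), lets the vector store fuse the two result sets, and then has
# the LLM synthesize the answer.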
if prompt := st.chat_input("Your question"):  # Prompt for user input and save to chat history
    st.session_state.messages.append({"role": "user", "content": prompt})

for message in st.session_state.messages:  # Display the prior chat messages
    with st.chat_message(message["role"]):
        st.write(message["content"])

# If the last message is not from the assistant, generate a new response
if st.session_state.messages[-1]["role"] != "assistant":
    with st.chat_message("assistant"):
        with st.spinner("Thinking..."):
            response = chat_engine.chat(prompt)
            st.write(response.response)
            message = {"role": "assistant", "content": response.response}
            st.session_state.messages.append(message)  # Add response to message history