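"""Streamlit app for semantic search over the Yalkut Yosef dataset.

Finds the passages most similar to a user's question with multilingual-e5
embeddings and, optionally, re-ranks them with an OpenAI chat model.
"""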
import streamlit as st
from streamlit.logger import get_logger
import datasets
import pandas as pd
from io import StringIO
from langchain_huggingface.embeddings import HuggingFaceEmbeddings
from langchain_openai import ChatOpenAI
from langchain_core.prompts import PromptTemplate
from langchain_core.messages import HumanMessage, SystemMessage
from sentence_transformers import util
from torch import tensor

LOGGER = get_logger(__name__)
def get_df() -> pd.DataFrame:
    # Load the pre-computed passage embeddings from the Hugging Face Hub.
    ds = datasets.load_dataset('sivan22/yalkut-yosef-embeddings')
    df = pd.DataFrame.from_dict(ds['train'])
    return df
def get_model() -> HuggingFaceEmbeddings:
    # Multilingual E5 supports Hebrew queries and passages.
    model_name = "intfloat/multilingual-e5-large"
    model_kwargs = {'device': 'cpu'}  # 'cpu' or 'cuda'
    encode_kwargs = {'normalize_embeddings': False}
    embeddings_model = HuggingFaceEmbeddings(
        model_name=model_name,
        model_kwargs=model_kwargs,
        encode_kwargs=encode_kwargs,
    )
    return embeddings_model
def get_chat_api(api_key: str) -> ChatOpenAI:
    chat = ChatOpenAI(model="gpt-3.5-turbo-16k", api_key=api_key)
    return chat
def get_results(embeddings_model, query, df, num_of_results) -> pd.DataFrame:
    # E5 models expect the "query: " prefix on search queries.
    embeddings = embeddings_model.embed_query('query: ' + query)
    # Similarity search against the pre-computed corpus embeddings.
    hits = util.semantic_search(tensor(embeddings), tensor(df['embeddings'].tolist()), top_k=num_of_results)
    hit_list = [hit['corpus_id'] for hit in hits[0]]
    return df.iloc[hit_list]
def get_llm_results(query, chat, results) -> pd.DataFrame:
    prompt_template = PromptTemplate.from_template(
        """
        Your mission is to rank the given answers by their relevance to the given question.
        Provide a relevancy score between 0 (not relevant) and 1 (highly relevant) for each possible answer.
        Return the results as a JSON object that maps each answer's text to its score,
        in the format {{"answer": score, "answer": score}}.
        The question is: {query}
        The possible answers are:
        {answers}
        """)
    messages = [
        SystemMessage(content="""
        You're a helpful assistant.
        Return a JSON formatted string.
        """),
        HumanMessage(content=prompt_template.format(query=query, answers='\n'.join(results['text'].head(10).tolist()))),
    ]
    response = chat.invoke(messages)
    # Parse the {"answer text": score, ...} object into a one-column DataFrame.
    llm_results_df = pd.read_json(StringIO(response.content), orient='index')
    llm_results_df.rename(columns={0: 'score'}, inplace=True)
    llm_results_df.sort_values(by='score', ascending=False, inplace=True)
    return llm_results_df
def run():
    st.set_page_config(
        page_title="חיפוש סמנטי בילקוט יוסף",  # "Semantic search in Yalkut Yosef"
        page_icon="📖",  # assumption: the original icon was garbled in transit
        layout="wide",
        initial_sidebar_state="expanded"
    )
    # "Smart search in the book Yalkut Yosef, Kitzur Shulchan Aruch"
    st.write("# חיפוש חכם בספר ילקוט יוסף קיצור שולחן ערוך")
    embeddings_model = get_model()
    df = get_df()
    # Label: "Write your question here"; placeholder: "How many candles are lit on each night of Chanukah"
    user_input = st.text_input('כתוב כאן את שאלתך', placeholder='כמה נרות מדליקים בכל לילה מלילות החנוכה')
    # "Number of results to display:"
    num_of_results = st.sidebar.slider('מספר התוצאות שברצוננו להציג:', 1, 25, 5)
    # "Use a language model to improve the results"
    use_llm = st.sidebar.checkbox("השתמש במודל שפה כדי לשפר תוצאות", False)
    openAikey = st.sidebar.text_input("OpenAI API key", type="password")

    if (st.button('חפש') or user_input) and user_input != "":  # "חפש" = "Search"
        results = get_results(embeddings_model, user_input, df, num_of_results)
        if use_llm:
            if not openAikey:
                st.write("נא הכנס מפתח של OpenAI")  # "Please enter an OpenAI key"
            else:
                chat = get_chat_api(openAikey)
                llm_results = get_llm_results(user_input, chat, results)
                st.write(llm_results)
        else:
            st.write(results[['siman', 'sek', 'text']].head(10))


if __name__ == "__main__":
    run()
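# To try the app locally (assuming this file is saved as app.py and the
# dependencies imported above are installed): streamlit run app.py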