AlexanderKazakov committed · Commit 34b78ab · Parent(s): eeafaaa

add cross-encoder and HF API LLM

Files changed:
- gradio_app/app.py +47 -18
- gradio_app/backend/ChatGptInteractor.py +34 -32
- gradio_app/backend/HuggingfaceGenerator.py +44 -0
- gradio_app/backend/cross_encoder.py +32 -0
- gradio_app/backend/query_llm.py +41 -145
- settings.py +8 -2
gradio_app/app.py
CHANGED
@@ -13,7 +13,8 @@ import markdown
 from jinja2 import Environment, FileSystemLoader
 
 from gradio_app.backend.ChatGptInteractor import num_tokens_from_messages
-from gradio_app.backend.
+from gradio_app.backend.cross_encoder import rerank_with_cross_encoder
+from gradio_app.backend.query_llm import *
 from gradio_app.backend.semantic_search import table, embedder
 
 from settings import *

@@ -45,42 +46,52 @@ def add_text(history, text):
     return history, gr.Textbox(value="", interactive=False)
 
 
-def bot(history,
-    top_k_rank = 5
-    thresh_dist = 1.2
+def bot(history, llm, cross_enc):
     history[-1][1] = ""
     query = history[-1][0]
 
     if not query:
-        gr.
-        raise ValueError("Empty string was submitted")
+        raise gr.Error("Empty string was submitted")
 
     logger.info('Retrieving documents...')
+    gr.Info('Start documents retrieval ...')
+    time = perf_counter()
 
     query_vec = embedder.embed(query)[0]
-    documents = table.search(query_vec, vector_column_name=VECTOR_COLUMN_NAME)
+    documents = table.search(query_vec, vector_column_name=VECTOR_COLUMN_NAME)
+    documents = documents.limit(TOP_K_RANK).to_list()
+    thresh_dist = thresh_distances[EMBED_NAME]
     thresh_dist = max(thresh_dist, min(d['_distance'] for d in documents))
     documents = [d for d in documents if d['_distance'] <= thresh_dist]
    documents = [doc[TEXT_COLUMN_NAME] for doc in documents]
 
-    logger.info(f'Finished Retrieving documents in {round(
+    time = perf_counter() - time
+    logger.info(f'Finished Retrieving documents in {round(time, 2)} seconds...')
 
+    logger.info('Reranking documents...')
+    gr.Info('Start documents reranking ...')
+    time = perf_counter()
+
+    documents = rerank_with_cross_encoder(cross_enc, documents, query)
+
+    time = perf_counter() - time
+    logger.info(f'Finished Reranking documents in {round(time, 2)} seconds...')
+
+    msg_constructor = get_message_constructor(llm)
     while len(documents) != 0:
         context = context_template.render(documents=documents)
         documents_html = [markdown.markdown(d) for d in documents]
         context_html = context_html_template.render(documents=documents_html)
-        messages =
-        num_tokens = num_tokens_from_messages(messages,
-        if num_tokens + 512 < context_lengths[
+        messages = msg_constructor(context, history)
+        num_tokens = num_tokens_from_messages(messages, 'gpt-3.5-turbo')  # todo for HF, it is approximation
+        if num_tokens + 512 < context_lengths[llm]:
            break
        documents.pop()
    else:
        raise gr.Error('Model context length exceeded, reload the page')
 
+    llm_gen = get_llm_generator(llm)
+    for part in llm_gen(messages):
        history[-1][1] += part
        yield history, context_html
    else:

@@ -110,7 +121,25 @@ with gr.Blocks() as demo:
     )
     txt_btn = gr.Button(value="Submit text", scale=1)
 
+    llm_name = gr.Radio(
+        choices=[
+            "gpt-3.5-turbo",
+            "mistralai/Mistral-7B-Instruct-v0.1",
+            "GeneZC/MiniChat-3B",
+        ],
+        value="gpt-3.5-turbo",
+        label='LLM'
+    )
+
+    cross_enc_name = gr.Radio(
+        choices=[
+            None,
+            "cross-encoder/ms-marco-TinyBERT-L-2-v2",
+            "cross-encoder/ms-marco-MiniLM-L-12-v2",
+        ],
+        value=None,
+        label='Cross-Encoder'
+    )
 
     # Examples
     gr.Examples(examples, input_textbox)

@@ -122,7 +151,7 @@ with gr.Blocks() as demo:
     txt_msg = txt_btn.click(
         add_text, [chatbot, input_textbox], [chatbot, input_textbox], queue=False
     ).then(
-        bot, [chatbot,
+        bot, [chatbot, llm_name, cross_enc_name], [chatbot, context_html]
     )
 
     # Turn it back on

@@ -130,7 +159,7 @@ with gr.Blocks() as demo:
 
     # Turn off interactivity while generating if you hit enter
     txt_msg = input_textbox.submit(add_text, [chatbot, input_textbox], [chatbot, input_textbox], queue=False).then(
-        bot, [chatbot,
+        bot, [chatbot, llm_name, cross_enc_name], [chatbot, context_html])
 
     # Turn it back on
     txt_msg.then(lambda: gr.Textbox(interactive=True), None, [input_textbox], queue=False)
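
For reference, two details of the reworked bot() above are easy to miss: the max(...) guard keeps at least the single closest hit even when every distance exceeds the per-embedder threshold, and the while/else loop drops the lowest-ranked documents until the prompt leaves roughly 512 tokens of headroom for the answer. A condensed restatement of that logic (hypothetical standalone snippet, reusing names from the diff):

# keep at least the closest hit, then apply the per-embedder distance cutoff
thresh_dist = max(thresh_distances[EMBED_NAME], min(d['_distance'] for d in documents))
documents = [d[TEXT_COLUMN_NAME] for d in documents if d['_distance'] <= thresh_dist]

# drop the lowest-ranked documents until the prompt leaves ~512 tokens for the completion
while documents:
    messages = msg_constructor(context_template.render(documents=documents), history)
    if num_tokens_from_messages(messages, 'gpt-3.5-turbo') + 512 < context_lengths[llm]:
        break
    documents.pop()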
gradio_app/backend/ChatGptInteractor.py
CHANGED
@@ -1,3 +1,4 @@
+import logging
 import time
 
 import tiktoken

@@ -9,6 +10,10 @@ with open('data/openaikey.txt') as f:
 openai.api_key = OPENAI_KEY
 
 
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)
+
+
 def num_tokens_from_messages(messages, model):
     """
     Return the number of tokens used by a list of messages.

@@ -17,7 +22,7 @@ def num_tokens_from_messages(messages, model):
     try:
         encoding = tiktoken.encoding_for_model(model)
     except KeyError:
+        logger.info("Warning: model not found. Using cl100k_base encoding.")
         encoding = tiktoken.get_encoding("cl100k_base")
     if model in {
         "gpt-3.5-turbo-0613",

@@ -33,10 +38,10 @@ def num_tokens_from_messages(messages, model):
         tokens_per_message = 4  # every message follows <|start|>{role/name}\n{content}<|end|>\n
         tokens_per_name = -1  # if there's a name, the role is omitted
     elif "gpt-3.5-turbo" in model:
-        #
+        # logger.info()("Warning: gpt-3.5-turbo may update over time. Returning num tokens assuming gpt-3.5-turbo-0613.")
         return num_tokens_from_messages(messages, model="gpt-3.5-turbo-0613")
     elif "gpt-4" in model:
-        #
+        # logger.info()("Warning: gpt-4 may update over time. Returning num tokens assuming gpt-4-0613.")
         return num_tokens_from_messages(messages, model="gpt-4-0613")
     else:
         raise NotImplementedError(

@@ -54,8 +59,11 @@ def num_tokens_from_messages(messages, model):
 
 
 class ChatGptInteractor:
-    def __init__(self, model_name='gpt-3.5-turbo'):
+    def __init__(self, model_name='gpt-3.5-turbo', max_tokens=None, temperature=None, stream=False):
         self.model_name = model_name
+        self.max_tokens = max_tokens
+        self.temperature = temperature
+        self.stream = stream
         self.tokenizer = tiktoken.encoding_for_model(self.model_name)
 
     def chat_completion_simple(

@@ -63,15 +71,9 @@ class ChatGptInteractor:
             *,
             user_text,
             system_text=None,
-            max_tokens=None,
-            temperature=None,
-            stream=False,
     ):
         return self.chat_completion(
             self._construct_messages_simple(user_text, system_text),
-            max_tokens=max_tokens,
-            temperature=temperature,
-            stream=stream,
         )
 
     def count_tokens_simple(self, *, user_text, system_text=None):

@@ -91,27 +93,17 @@ class ChatGptInteractor:
             })
         return messages
 
-    def chat_completion(
-            messages,
-            max_tokens=None,
-            temperature=None,
-            stream=False,
-    ):
-        print(f'Sending request to {self.model_name} stream={stream} ...')
+    def chat_completion(self, messages):
+        logger.info(f'Sending request to {self.model_name} stream={self.stream} ...')
         t1 = time.time()
-        completion = self._request(
-            stream=stream,
-        )
-        if stream:
-            return completion
+        completion = self._request(messages)
+
+        if self.stream:
+            return self._generator(completion)
+
         t2 = time.time()
         usage = completion['usage']
+        logger.info(
             f'Received response: {usage["prompt_tokens"]} in + {usage["completion_tokens"]} out'
             f' = {usage["total_tokens"]} total tokens. Time: {t2 - t1:3.1f} seconds'
         )

@@ -121,14 +113,23 @@ class ChatGptInteractor:
     def get_stream_text(stream_part):
         return stream_part['choices'][0]['delta'].get('content', '')
 
+    @staticmethod
+    def _generator(completion):
+        for part in completion:
+            yield ChatGptInteractor.get_stream_text(part)
+
     def count_tokens(self, messages):
         return num_tokens_from_messages(messages, self.model_name)
 
-    def _request(self,
+    def _request(self, messages):
         for _ in range(5):
             try:
                 completion = openai.ChatCompletion.create(
+                    messages=messages,
+                    model=self.model_name,
+                    max_tokens=self.max_tokens,
+                    temperature=self.temperature,
+                    stream=self.stream,
                     request_timeout=100.0,
                 )
                 return completion

@@ -164,7 +165,8 @@ if __name__ == '__main__':
     print(cgi.chat_completion_simple(user_text=ut, system_text=st))
     print('---')
 
+    cgi = ChatGptInteractor(stream=True)
+    for part in cgi.chat_completion_simple(user_text=ut, system_text=st):
+        print(part, end='')
     print('\n---')
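
The practical change in this file: max_tokens, temperature and stream are now fixed on the instance instead of being passed to every call, and a streaming completion is returned as a plain generator of text chunks. A minimal usage sketch (assuming a valid key in data/openaikey.txt):

cgi = ChatGptInteractor(model_name='gpt-3.5-turbo', max_tokens=256, temperature=0, stream=True)
for chunk in cgi.chat_completion_simple(user_text="Say hello in five words."):
    print(chunk, end='')  # with stream=False, the call would instead return the full completion object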
gradio_app/backend/HuggingfaceGenerator.py
ADDED
@@ -0,0 +1,44 @@
+import logging
+
+from huggingface_hub import InferenceClient
+from transformers import AutoTokenizer
+
+with open('data/hftoken.txt') as f:
+    HF_TOKEN = f.read().strip()
+
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)
+
+
+# noinspection PyTypeChecker
+class HuggingfaceGenerator:
+    def __init__(
+            self, model_name,
+            temperature: float = 0.9, max_new_tokens: int = 512,
+            top_p: float = None, repetition_penalty: float = None,
+            stream: bool = True,
+    ):
+        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
+        self.hf_client = InferenceClient(model_name, token=HF_TOKEN)
+        self.stream = stream
+
+        self.generate_kwargs = {
+            'temperature': max(temperature, 0.1),
+            'max_new_tokens': max_new_tokens,
+            'top_p': top_p,
+            'repetition_penalty': repetition_penalty,
+            'do_sample': True,
+            'seed': 42,
+        }
+
+    def generate(self, messages):
+        formatted_prompt = self.tokenizer.apply_chat_template(messages, tokenize=False)
+
+        logger.info(f'Start HuggingFace generation, model {self.hf_client.model} ...')
+        stream = self.hf_client.text_generation(
+            formatted_prompt, **self.generate_kwargs,
+            stream=self.stream, details=True, return_full_text=not self.stream
+        )
+
+        for response in stream:
+            yield response.token.text
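
A minimal usage sketch of the new generator (hypothetical values; assumes a valid token in data/hftoken.txt and that the model is reachable through the HF Inference API):

from gradio_app.backend.HuggingfaceGenerator import HuggingfaceGenerator

hfg = HuggingfaceGenerator(model_name="mistralai/Mistral-7B-Instruct-v0.1", temperature=0, max_new_tokens=128)
messages = [{"role": "user", "content": "What is a cross-encoder?"}]
for piece in hfg.generate(messages):  # with stream=True (the default) text arrives incrementally
    print(piece, end="")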
gradio_app/backend/cross_encoder.py
ADDED
@@ -0,0 +1,32 @@
+import torch
+from transformers import AutoTokenizer, AutoModelForSequenceClassification
+
+from settings import *
+
+
+cross_encoder = None
+cross_enc_tokenizer = None
+
+
+@torch.no_grad()
+def rerank_with_cross_encoder(cross_enc_name, documents, query):
+    if cross_enc_name is None or len(documents) <= 1:
+        return documents
+
+    global cross_encoder, cross_enc_tokenizer
+    if cross_encoder is None or cross_encoder.name_or_path != cross_enc_name:
+        cross_encoder = AutoModelForSequenceClassification.from_pretrained(cross_enc_name)
+        cross_encoder.eval()
+        cross_enc_tokenizer = AutoTokenizer.from_pretrained(cross_enc_name)
+
+    features = cross_enc_tokenizer(
+        [query] * len(documents), documents, padding=True, truncation=True, return_tensors="pt"
+    )
+    scores = cross_encoder(**features).logits.squeeze()
+    ranks = torch.argsort(scores, descending=True)
+    documents = [documents[i] for i in ranks[:TOP_K_RERANK]]
+    return documents
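
A minimal usage sketch of the reranker (hypothetical documents; the model and tokenizer are loaded lazily on first use and cached in the module-level globals, and passing None as the model name skips reranking):

from gradio_app.backend.cross_encoder import rerank_with_cross_encoder

docs = [
    "LanceDB is an embedded vector database.",
    "Bananas are rich in potassium.",
    "A cross-encoder scores a (query, document) pair jointly in one transformer pass.",
]
top_docs = rerank_with_cross_encoder("cross-encoder/ms-marco-MiniLM-L-12-v2", docs, "what is a cross-encoder?")
# returns at most TOP_K_RERANK documents, best-scoring first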
gradio_app/backend/query_llm.py
CHANGED
@@ -1,102 +1,30 @@
-import gradio as gr
-
-from typing import Any, Dict, Generator, List
-
-# from huggingface_hub import InferenceClient
-# from transformers import AutoTokenizer
 from jinja2 import Environment, FileSystemLoader
 
-from settings import *
 from gradio_app.backend.ChatGptInteractor import *
-
-
-# tokenizer = AutoTokenizer.from_pretrained(LLM_NAME)
-# HF_TOKEN = None
-# hf_client = InferenceClient(LLM_NAME, token=HF_TOKEN)
-
-
-def format_prompt(message: str, api_kind: str):
-    """
-    Formats the given message using a chat template.
-
-    Args:
-        message (str): The user message to be formatted.
-
-    Returns:
-        str: Formatted message after applying the chat template.
-    """
-
-    # Create a list of message dictionaries with role and content
-    messages: List[Dict[str, Any]] = [{'role': 'user', 'content': message}]
-
-    if api_kind == "openai":
-        return messages
-    elif api_kind == "hf":
-        return tokenizer.apply_chat_template(messages, tokenize=False)
-    else:
-        raise ValueError("API is not supported")
-
-
-def generate_hf(prompt: str, history: str, temperature: float = 0.9, max_new_tokens: int = 512,
-                top_p: float = 0.6, repetition_penalty: float = 1.2) -> Generator[str, None, str]:
-    """
-    Generate a sequence of tokens based on a given prompt and history using Mistral client.
-
-    Args:
-        prompt (str): The initial prompt for the text generation.
-        history (str): Context or history for the text generation.
-        temperature (float, optional): The softmax temperature for sampling. Defaults to 0.9.
-        max_new_tokens (int, optional): Maximum number of tokens to be generated. Defaults to 256.
-        top_p (float, optional): Nucleus sampling probability. Defaults to 0.95.
-        repetition_penalty (float, optional): Penalty for repeated tokens. Defaults to 1.0.
-
-    Returns:
-        Generator[str, None, str]: A generator yielding chunks of generated text.
-        Returns a final string if an error occurs.
-    """
-
-    temperature = max(float(temperature), 1e-2)  # Ensure temperature isn't too low
-    top_p = float(top_p)
-
-    generate_kwargs = {
-        'temperature': temperature,
-        'max_new_tokens': max_new_tokens,
-        'top_p': top_p,
-        'repetition_penalty': repetition_penalty,
-        'do_sample': True,
-        'seed': 42,
-    }
-
-    formatted_prompt = format_prompt(prompt, "hf")
-
-    try:
-        stream = hf_client.text_generation(formatted_prompt, **generate_kwargs,
-                                           stream=True, details=True, return_full_text=False)
-        output = ""
-        for response in stream:
-            output += response.token.text
-            yield output
-
-    except Exception as e:
-        if "Too Many Requests" in str(e):
-            print("ERROR: Too many requests on Mistral client")
-            gr.Warning("Unfortunately Mistral is unable to process")
-            return "Unfortunately, I am not able to process your request now."
-        elif "Authorization header is invalid" in str(e):
-            print("Authetification error:", str(e))
-            gr.Warning("Authentication error: HF token was either not provided or incorrect")
-            return "Authentication error"
-        else:
-            print("Unhandled Exception:", str(e))
-            gr.Warning("Unfortunately Mistral is unable to process")
-            return "I do not know what happened, but I couldn't understand you."
-
+from gradio_app.backend.HuggingfaceGenerator import HuggingfaceGenerator
 
 env = Environment(loader=FileSystemLoader('gradio_app/templates'))
 context_template = env.get_template('context_template.j2')
 start_system_message = context_template.render(documents=[])
 
 
+def construct_mistral_messages(context, history):
+    messages = []
+    for q, a in history:
+        if len(a) == 0:  # the last message
+            q = context + f'\n\nQuery:\n\n{q}'
+        messages.append({
+            "role": "user",
+            "content": q,
+        })
+        if len(a) != 0:  # some of the previous LLM answers
+            messages.append({
+                "role": "assistant",
+                "content": a,
+            })
+    return messages
+
+
 def construct_openai_messages(context, history):
     messages = [
         {

@@ -122,64 +50,32 @@ def construct_openai_messages(context, history):
     return messages
 
 
-def
-
-
-def _generate_openai(prompt: str, history: str, temperature: float = 0.9, max_new_tokens: int = 512,
-                     top_p: float = 0.6, repetition_penalty: float = 1.2) -> Generator[str, None, str]:
-    """
-    Generate a sequence of tokens based on a given prompt and history using Mistral client.
-
-    Args:
-        prompt (str): The initial prompt for the text generation.
-        history (str): Context or history for the text generation.
-        temperature (float, optional): The softmax temperature for sampling. Defaults to 0.9.
-        max_new_tokens (int, optional): Maximum number of tokens to be generated. Defaults to 256.
-        top_p (float, optional): Nucleus sampling probability. Defaults to 0.95.
-        repetition_penalty (float, optional): Penalty for repeated tokens. Defaults to 1.0.
-
-    formatted_prompt = format_prompt(prompt, "openai")
-
-    try:
-        stream = openai.ChatCompletion.create(
-            model=LLM_NAME,
-            messages=formatted_prompt,
-            **generate_kwargs,
-            stream=True
-        )
-        output = ""
-        for chunk in stream:
-            output += chunk.choices[0].delta.get("content", "")
-            yield output
-
-    except Exception as e:
-        if "Too Many Requests" in str(e):
-            print("ERROR: Too many requests on OpenAI client")
-            gr.Warning("Unfortunately OpenAI is unable to process")
-            return "Unfortunately, I am not able to process your request now."
-        elif "You didn't provide an API key" in str(e):
-            print("Authetification error:", str(e))
-            gr.Warning("Authentication error: OpenAI key was either not provided or incorrect")
-            return "Authentication error"
-        else:
-            print("Unhandled Exception:", str(e))
-            gr.Warning("Unfortunately OpenAI is unable to process")
-            return "I do not know what happened, but I couldn't understand you."
+def get_message_constructor(llm_name):
+    if llm_name == 'gpt-3.5-turbo':
+        return construct_openai_messages
+    if llm_name in ['mistralai/Mistral-7B-Instruct-v0.1', "GeneZC/MiniChat-3B"]:
+        return construct_mistral_messages
+    raise ValueError('Unknown LLM name')
+
+
+def get_llm_generator(llm_name):
+    if llm_name == 'gpt-3.5-turbo':
+        cgi = ChatGptInteractor(
+            model_name=llm_name, max_tokens=512, temperature=0, stream=True
+        )
+        return cgi.chat_completion
+    if llm_name == 'mistralai/Mistral-7B-Instruct-v0.1':
+        hfg = HuggingfaceGenerator(
+            model_name=llm_name, temperature=0, max_new_tokens=512,
+        )
+        return hfg.generate
+
+    if llm_name == "GeneZC/MiniChat-3B":
+        hfg = HuggingfaceGenerator(
+            model_name=llm_name, temperature=0, max_new_tokens=250, stream=False,
+        )
+        return hfg.generate
+    raise ValueError('Unknown LLM name')
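
A minimal sketch of how the two new dispatch helpers are combined (mirroring the calls in bot(); the context string here is a hypothetical stand-in for the rendered Jinja template):

llm = "mistralai/Mistral-7B-Instruct-v0.1"
history = [["What is RAG?", ""]]  # Gradio-style history; the empty answer marks the pending turn

messages = get_message_constructor(llm)("Documents:\n<retrieved context here>", history)
llm_gen = get_llm_generator(llm)
answer = "".join(llm_gen(messages))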
settings.py
CHANGED
@@ -5,11 +5,11 @@ VECTOR_COLUMN_NAME = "embedding"
 TEXT_COLUMN_NAME = "text"
 DOCUMENT_PATH_COLUMN_NAME = "document_path"
 
-# LLM_NAME = "mistralai/Mistral-7B-Instruct-v0.1"
-LLM_NAME = "gpt-3.5-turbo"
 # EMBED_NAME = "sentence-transformers/all-MiniLM-L6-v2"
 EMBED_NAME = "text-embedding-ada-002"
 
+TOP_K_RANK = 50
+TOP_K_RERANK = 5
 
 emb_sizes = {
     "sentence-transformers/all-MiniLM-L6-v2": 384,

@@ -17,8 +17,14 @@ emb_sizes = {
     "text-embedding-ada-002": 1536,
 }
 
+thresh_distances = {
+    "sentence-transformers/all-MiniLM-L6-v2": 1.2,
+    "text-embedding-ada-002": 0.5,
+}
+
 context_lengths = {
     "mistralai/Mistral-7B-Instruct-v0.1": 4096,
+    "GeneZC/MiniChat-3B": 4096,
     "gpt-3.5-turbo": 4096,
     "sentence-transformers/all-MiniLM-L6-v2": 128,
     "thenlper/gte-large": 512,