Update main.py
main.py CHANGED
@@ -70,7 +70,7 @@ def limit_tokens(input_string, token_limit=6000):
 def calculate_tokens(msgs):
     return sum(len(encoding.encode(str(m))) for m in msgs)
 
-async def chat_with_llama_stream(messages, model="gpt-3.5-turbo", max_llm_history=4, max_output_tokens=2500):
+def chat_with_llama_stream(messages, model="gpt-3.5-turbo", max_llm_history=4, max_output_tokens=2500):
     while calculate_tokens(messages) > (8000 - max_output_tokens):
         if len(messages) > max_llm_history:
             messages = [messages[0]] + messages[-max_llm_history:]
@@ -78,10 +78,10 @@ async def chat_with_llama_stream(messages, model="gpt-3.5-turbo", max_llm_histor
         max_llm_history -= 1
         if max_llm_history < 2:
             error_message = "Token limit exceeded. Please shorten your input or start a new conversation."
-            raise HTTPException(status_code=400, detail=error_message)
+            raise HTTPException(status_code=400, detail=error_message)
 
     try:
-        response = await or_client.chat.completions.create(
+        response = or_client.chat.completions.create(
             model=model,
             messages=messages,
             max_tokens=max_output_tokens,
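The two hunks above turn `chat_with_llama_stream` into a plain synchronous function and show the history-trimming loop: keep the first (system) message plus the most recent turns, shrink the window until the prompt fits the 8000-token context minus the reserved output budget, and fail with a 400 once fewer than two messages would remain. A minimal self-contained sketch of that logic; the `tiktoken` import and encoding name are assumptions, since the diff does not show how `encoding` is built:

```python
# Self-contained sketch of the trimming loop above. Assumes `encoding`
# is a tiktoken encoding; the Space may configure it differently.
import tiktoken

encoding = tiktoken.get_encoding("cl100k_base")  # assumed encoding name

def calculate_tokens(msgs):
    return sum(len(encoding.encode(str(m))) for m in msgs)

def trim_history(messages, max_llm_history=4, max_output_tokens=2500):
    # Drop middle turns, keeping messages[0] (the system prompt) and the
    # most recent tail, until the prompt fits the context budget.
    while calculate_tokens(messages) > (8000 - max_output_tokens):
        if len(messages) > max_llm_history:
            messages = [messages[0]] + messages[-max_llm_history:]
        max_llm_history -= 1
        if max_llm_history < 2:
            raise ValueError("Token limit exceeded. Please shorten your "
                             "input or start a new conversation.")
    return messages
```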
@@ -89,7 +89,7 @@ async def chat_with_llama_stream(messages, model="gpt-3.5-turbo", max_llm_histor
         )
 
         full_response = ""
-        async for chunk in response:
+        for chunk in response:
             if chunk.choices[0].delta.content is not None:
                 content = chunk.choices[0].delta.content
                 full_response += content
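This hunk is the streaming half of the async-to-sync change: with `await` dropped from the `create(...)` call above, the response is iterated with a plain `for` instead of `async for`. A sketch of the resulting pattern, assuming `or_client` is an OpenAI-compatible client; the OpenRouter base URL, env var, and `stream=True` (which would sit on the elided line 88) are assumptions:

```python
# Sketch of the synchronous streaming pattern this commit switches to.
# The client construction below is illustrative, not from the diff.
import os
from openai import OpenAI

or_client = OpenAI(
    base_url="https://openrouter.ai/api/v1",            # assumed backend
    api_key=os.environ.get("OPENROUTER_API_KEY", ""),   # assumed env var
)

def stream_chat(messages, model="gpt-3.5-turbo", max_output_tokens=2500):
    response = or_client.chat.completions.create(
        model=model,
        messages=messages,
        max_tokens=max_output_tokens,
        stream=True,  # assumed; iterating over chunks implies streaming mode
    )
    for chunk in response:  # sync client yields chunks in a plain for-loop
        if chunk.choices[0].delta.content is not None:
            yield chunk.choices[0].delta.content
```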
@@ -100,7 +100,6 @@ async def chat_with_llama_stream(messages, model="gpt-3.5-turbo", max_llm_histor
     except Exception as e:
         raise HTTPException(status_code=500, detail=f"Error in model response: {str(e)}")
 
-
 async def verify_api_key(api_key: str = Security(api_key_header)):
     if api_key != API_KEY:
         raise HTTPException(status_code=403, detail="Could not validate credentials")
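`verify_api_key` itself is unchanged; the hunk only drops a blank line above it. For reference, a typical FastAPI wiring for this kind of dependency looks like the sketch below; the header name and environment variable are assumptions, since the diff does not show how `api_key_header` or `API_KEY` are defined:

```python
# Typical wiring for an API-key header dependency. The "X-API-Key"
# header name and API_KEY env var are assumptions for illustration.
import os
from fastapi import HTTPException, Security
from fastapi.security import APIKeyHeader

API_KEY = os.environ.get("API_KEY", "")
api_key_header = APIKeyHeader(name="X-API-Key", auto_error=False)

async def verify_api_key(api_key: str = Security(api_key_header)):
    if api_key != API_KEY:
        raise HTTPException(status_code=403, detail="Could not validate credentials")
    return api_key
```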
@@ -176,9 +175,9 @@ async def coding_assistant(query: QueryModel, background_tasks: BackgroundTasks,
     # Limit tokens in the conversation history
     limited_conversation = conversations[query.conversation_id]
 
-    async def process_response():
+    def process_response():
         full_response = ""
-        async for content in chat_with_llama_stream(limited_conversation, model=query.model_id):
+        for content in chat_with_llama_stream(limited_conversation, model=query.model_id):
             full_response += content
             yield content
         background_tasks.add_task(update_db, query.user_id, query.conversation_id, query.user_query, full_response)
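With `process_response` now a plain generator, the endpoint can hand it straight to FastAPI's `StreamingResponse`, and the `update_db` task queued inside the generator runs after the response finishes. A sketch of how the pieces could plug together; the route path, media type, `QueryModel` fields, and the stubbed `conversations`/`update_db` are assumptions for illustration:

```python
# Hypothetical endpoint wiring for the now-synchronous generator.
# chat_with_llama_stream is the function defined in the diff above;
# everything stubbed here is an assumption, not from the diff.
from fastapi import BackgroundTasks, FastAPI
from fastapi.responses import StreamingResponse
from pydantic import BaseModel

app = FastAPI()
conversations: dict = {}  # stub in-memory conversation store

class QueryModel(BaseModel):  # assumed shape, inferred from the fields used
    user_id: str
    conversation_id: str
    user_query: str
    model_id: str = "gpt-3.5-turbo"

def update_db(user_id, conversation_id, user_query, full_response):
    pass  # stub: the real Space persists the exchange

@app.post("/coding-assistant")  # assumed route path
async def coding_assistant(query: QueryModel, background_tasks: BackgroundTasks):
    limited_conversation = conversations[query.conversation_id]

    def process_response():
        full_response = ""
        for content in chat_with_llama_stream(limited_conversation, model=query.model_id):
            full_response += content
            yield content
        # Queued during streaming; BackgroundTasks run it after the response is sent.
        background_tasks.add_task(update_db, query.user_id, query.conversation_id,
                                  query.user_query, full_response)

    return StreamingResponse(process_response(), media_type="text/plain")
```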