Update main.py
main.py CHANGED
@@ -70,7 +70,7 @@ def limit_tokens(input_string, token_limit=6000):
 def calculate_tokens(msgs):
     return sum(len(encoding.encode(str(m))) for m in msgs)
 
-async def chat_with_llama_stream(messages, model="gpt-3.5-turbo", max_llm_history=4, max_output_tokens=2500):
+def chat_with_llama_stream(messages, model="gpt-3.5-turbo", max_llm_history=4, max_output_tokens=2500):
     while calculate_tokens(messages) > (8000 - max_output_tokens):
         if len(messages) > max_llm_history:
             messages = [messages[0]] + messages[-max_llm_history:]
@@ -78,10 +78,10 @@ async def chat_with_llama_stream(messages, model="gpt-3.5-turbo", max_llm_histor
         max_llm_history -= 1
         if max_llm_history < 2:
             error_message = "Token limit exceeded. Please shorten your input or start a new conversation."
-            raise HTTPException(status_code=400, detail=error_message)
+            raise HTTPException(status_code=400, detail=error_message)
 
     try:
-        response = await or_client.chat.completions.create(
+        response = or_client.chat.completions.create(
             model=model,
             messages=messages,
             max_tokens=max_output_tokens,
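The two hunks above turn `chat_with_llama_stream` into a plain synchronous function and show the history-trimming loop: keep the first (system) message plus the most recent turns, shrink the window until the prompt fits the 8000-token context minus the reserved output budget, and fail with a 400 once fewer than two messages would remain. A minimal self-contained sketch of that logic; the `tiktoken` import and encoding name are assumptions, since the diff does not show how `encoding` is built:

```python
# Self-contained sketch of the trimming loop above. Assumes `encoding`
# is a tiktoken encoding; the Space may configure it differently.
import tiktoken

encoding = tiktoken.get_encoding("cl100k_base")  # assumed encoding name

def calculate_tokens(msgs):
    return sum(len(encoding.encode(str(m))) for m in msgs)

def trim_history(messages, max_llm_history=4, max_output_tokens=2500):
    # Drop middle turns, keeping messages[0] (the system prompt) and the
    # most recent tail, until the prompt fits the context budget.
    while calculate_tokens(messages) > (8000 - max_output_tokens):
        if len(messages) > max_llm_history:
            messages = [messages[0]] + messages[-max_llm_history:]
        max_llm_history -= 1
        if max_llm_history < 2:
            raise ValueError("Token limit exceeded. Please shorten your "
                             "input or start a new conversation.")
    return messages
```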
@@ -89,7 +89,7 @@ async def chat_with_llama_stream(messages, model="gpt-3.5-turbo", max_llm_histor
         )
 
         full_response = ""
-        async for chunk in response:
+        for chunk in response:
             if chunk.choices[0].delta.content is not None:
                 content = chunk.choices[0].delta.content
                 full_response += content
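This hunk is the streaming half of the async-to-sync change: with `await` dropped from the `create(...)` call above, the response is iterated with a plain `for` instead of `async for`. A sketch of the resulting pattern, assuming `or_client` is an OpenAI-compatible client; the OpenRouter base URL, env var, and `stream=True` (which would sit on the elided line 88) are assumptions:

```python
# Sketch of the synchronous streaming pattern this commit switches to.
# The client construction below is illustrative, not from the diff.
import os
from openai import OpenAI

or_client = OpenAI(
    base_url="https://openrouter.ai/api/v1",            # assumed backend
    api_key=os.environ.get("OPENROUTER_API_KEY", ""),   # assumed env var
)

def stream_chat(messages, model="gpt-3.5-turbo", max_output_tokens=2500):
    response = or_client.chat.completions.create(
        model=model,
        messages=messages,
        max_tokens=max_output_tokens,
        stream=True,  # assumed; iterating over chunks implies streaming mode
    )
    for chunk in response:  # sync client yields chunks in a plain for-loop
        if chunk.choices[0].delta.content is not None:
            yield chunk.choices[0].delta.content
```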
@@ -100,7 +100,6 @@ async def chat_with_llama_stream(messages, model="gpt-3.5-turbo", max_llm_histor
     except Exception as e:
         raise HTTPException(status_code=500, detail=f"Error in model response: {str(e)}")
 
-
 async def verify_api_key(api_key: str = Security(api_key_header)):
     if api_key != API_KEY:
         raise HTTPException(status_code=403, detail="Could not validate credentials")
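`verify_api_key` itself is unchanged; the hunk only drops a blank line above it. For reference, a typical FastAPI wiring for this kind of dependency looks like the sketch below; the header name and environment variable are assumptions, since the diff does not show how `api_key_header` or `API_KEY` are defined:

```python
# Typical wiring for an API-key header dependency. The "X-API-Key"
# header name and API_KEY env var are assumptions for illustration.
import os
from fastapi import HTTPException, Security
from fastapi.security import APIKeyHeader

API_KEY = os.environ.get("API_KEY", "")
api_key_header = APIKeyHeader(name="X-API-Key", auto_error=False)

async def verify_api_key(api_key: str = Security(api_key_header)):
    if api_key != API_KEY:
        raise HTTPException(status_code=403, detail="Could not validate credentials")
    return api_key
```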
@@ -176,9 +175,9 @@ async def coding_assistant(query: QueryModel, background_tasks: BackgroundTasks,
     # Limit tokens in the conversation history
     limited_conversation = conversations[query.conversation_id]
 
-    async def process_response():
+    def process_response():
         full_response = ""
-        async for content in chat_with_llama_stream(limited_conversation, model=query.model_id):
+        for content in chat_with_llama_stream(limited_conversation, model=query.model_id):
             full_response += content
             yield content
         background_tasks.add_task(update_db, query.user_id, query.conversation_id, query.user_query, full_response)
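With `process_response` now a plain generator, the endpoint can hand it straight to FastAPI's `StreamingResponse`, and the `update_db` task queued inside the generator runs after the response finishes. A sketch of how the pieces could plug together; the route path, media type, `QueryModel` fields, and the stubbed `conversations`/`update_db` are assumptions for illustration:

```python
# Hypothetical endpoint wiring for the now-synchronous generator.
# chat_with_llama_stream is the function defined in the diff above;
# everything stubbed here is an assumption, not from the diff.
from fastapi import BackgroundTasks, FastAPI
from fastapi.responses import StreamingResponse
from pydantic import BaseModel

app = FastAPI()
conversations: dict = {}  # stub in-memory conversation store

class QueryModel(BaseModel):  # assumed shape, inferred from the fields used
    user_id: str
    conversation_id: str
    user_query: str
    model_id: str = "gpt-3.5-turbo"

def update_db(user_id, conversation_id, user_query, full_response):
    pass  # stub: the real Space persists the exchange

@app.post("/coding-assistant")  # assumed route path
async def coding_assistant(query: QueryModel, background_tasks: BackgroundTasks):
    limited_conversation = conversations[query.conversation_id]

    def process_response():
        full_response = ""
        for content in chat_with_llama_stream(limited_conversation, model=query.model_id):
            full_response += content
            yield content
        # Queued during streaming; BackgroundTasks run it after the response is sent.
        background_tasks.add_task(update_db, query.user_id, query.conversation_id,
                                  query.user_query, full_response)

    return StreamingResponse(process_response(), media_type="text/plain")
```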