web-server

Sleeping

App Files Files Community

pvanand committed on Jul 15, 2024

Commit

68394ea

verified ·

1 Parent(s): b067e10

Update main.py

Browse files

Files changed (1) hide show

main.py +110 -17

main.py CHANGED Viewed

@@ -1,16 +1,22 @@
-from fastapi import FastAPI, HTTPException, Depends, Security
 from fastapi.security import APIKeyHeader
 from fastapi.responses import StreamingResponse
 from pydantic import BaseModel, Field
-from typing import Literal
 import os
 from functools import lru_cache
 from openai import OpenAI
 app = FastAPI()
 API_KEY_NAME = "X-API-Key"
-API_KEY = os.environ.get("API_KEY", "default_secret_key")  # Set this in your environment variables
 api_key_header = APIKeyHeader(name=API_KEY_NAME, auto_error=False)
 ModelID = Literal[
@@ -29,12 +35,16 @@ class QueryModel(BaseModel):
         default="meta-llama/llama-3-70b-instruct",
         description="ID of the model to use for response generation"
     )
     class Config:
         schema_extra = {
             "example": {
                 "user_query": "How do I implement a binary search in Python?",
-                "model_id": "meta-llama/llama-3-70b-instruct"
             }
         }
@@ -47,7 +57,28 @@ def get_api_keys():
 api_keys = get_api_keys()
 or_client = OpenAI(api_key=api_keys["OPENROUTER_API_KEY"], base_url="https://openrouter.ai/api/v1")
-def chat_with_llama_stream(messages, model, max_output_tokens=2500):
     try:
         response = or_client.chat.completions.create(
             model=model,
@@ -56,9 +87,16 @@ def chat_with_llama_stream(messages, model, max_output_tokens=2500):
             stream=True
         )
         for chunk in response:
             if chunk.choices[0].delta.content is not None:
-                yield chunk.choices[0].delta.content
     except Exception as e:
         raise HTTPException(status_code=500, detail=f"Error in model response: {str(e)}")
@@ -67,8 +105,48 @@ async def verify_api_key(api_key: str = Security(api_key_header)):
         raise HTTPException(status_code=403, detail="Could not validate credentials")
     return api_key
 @app.post("/coding-assistant")
-async def coding_assistant(query: QueryModel, api_key: str = Depends(verify_api_key)):
     """
     Coding assistant endpoint that provides programming help based on user queries.
@@ -83,16 +161,31 @@ async def coding_assistant(query: QueryModel, api_key: str = Depends(verify_api_
     Requires API Key authentication via X-API-Key header.
     """
-    system_prompt = "You are a helpful assistant proficient in coding tasks. Help the user in understanding and writing code."
-    messages = [
-        {"role": "system", "content": system_prompt},
-        {"role": "user", "content": query.user_query}
-    ]
-    return StreamingResponse(
-        chat_with_llama_stream(messages, model=query.model_id),
-        media_type="text/event-stream"
-    )
 if __name__ == "__main__":
     import uvicorn

+from fastapi import FastAPI, HTTPException, Depends, Security, BackgroundTasks
 from fastapi.security import APIKeyHeader
 from fastapi.responses import StreamingResponse
 from pydantic import BaseModel, Field
+from typing import Literal, List, Dict
 import os
 from functools import lru_cache
 from openai import OpenAI
+from uuid import uuid4
+import tiktoken
+import sqlite3
+import time
+from datetime import datetime, timedelta
+import asyncio
 app = FastAPI()
 API_KEY_NAME = "X-API-Key"
+API_KEY = os.environ.get("API_KEY", "default_secret_key")
 api_key_header = APIKeyHeader(name=API_KEY_NAME, auto_error=False)
 ModelID = Literal[
         default="meta-llama/llama-3-70b-instruct",
         description="ID of the model to use for response generation"
     )
+    conversation_id: str = Field(default_factory=lambda: str(uuid4()), description="Unique identifier for the conversation")
+    user_id: str = Field(..., description="Unique identifier for the user")
     class Config:
         schema_extra = {
             "example": {
                 "user_query": "How do I implement a binary search in Python?",
+                "model_id": "meta-llama/llama-3-70b-instruct",
+                "conversation_id": "123e4567-e89b-12d3-a456-426614174000",
+                "user_id": "user123"
             }
         }
 api_keys = get_api_keys()
 or_client = OpenAI(api_key=api_keys["OPENROUTER_API_KEY"], base_url="https://openrouter.ai/api/v1")
+# In-memory storage for conversations: conversation_id -> list of chat
+# messages ({"role": ..., "content": ...} dicts), starting with the system prompt.
+conversations: Dict[str, List[Dict[str, str]]] = {}
+# conversation_id -> time.time() of the latest request; read by the cleanup loop
+# to decide which conversations are inactive.
+last_activity: Dict[str, float] = {}
+# Token encoding
+# NOTE(review): this is the gpt-3.5-turbo tokenizer while the default model_id is a
+# Llama model, so counts are presumably approximations — confirm this is acceptable.
+encoding = tiktoken.encoding_for_model("gpt-3.5-turbo")
+def limit_tokens(input_string, token_limit=6000):
+    """Return `input_string` cut down to its first `token_limit` tokens.
+
+    The text is tokenized with the module-level `encoding`, the token list
+    is sliced, and the kept tokens are decoded back into a string.
+    """
+    token_ids = encoding.encode(input_string)
+    kept_ids = token_ids[:token_limit]
+    return encoding.decode(kept_ids)
+def calculate_tokens(msgs):
+    """Return the total token count of `msgs`.
+
+    Each item is converted with `str()` before encoding, so for a message
+    dict the count covers its whole repr (keys and punctuation included).
+    """
+    total = 0
+    for message in msgs:
+        rendered = str(message)
+        total += len(encoding.encode(rendered))
+    return total
+def chat_with_llama_stream(messages, model="gpt-3.5-turbo", max_llm_history=4, max_output_tokens=2500):
+    """Stream a chat completion, trimming the history to fit the token budget.
+
+    Yields each non-empty content delta as it arrives. When the stream is
+    exhausted, the assembled assistant reply is appended to the list the
+    caller passed in and is also the generator's return value.
+
+    Raises:
+        ValueError: if the history cannot be trimmed below the budget of
+            ``8000 - max_output_tokens`` tokens.
+        HTTPException: status 500 if the completion request or stream fails.
+    """
+    # Trimming below rebinds `messages` to a new list; keep a handle on the
+    # caller's list so the reply still lands in the real history.
+    history = messages
+    token_budget = 8000 - max_output_tokens
+    while calculate_tokens(messages) > token_budget:
+        if max_llm_history < 2:
+            raise ValueError("Unable to reduce message length below token limit")
+        if len(messages) > max_llm_history + 1:
+            # Keep the first message plus the newest `max_llm_history` ones.
+            messages = [messages[0]] + messages[-max_llm_history:]
+        else:
+            # The list is already at this window size, so slicing again would
+            # rebuild the same list; shrink the window to guarantee progress.
+            max_llm_history -= 1
     try:
         response = or_client.chat.completions.create(
             model=model,
             stream=True
         )
+        reply_parts = []
         for chunk in response:
             if chunk.choices[0].delta.content is not None:
+                content = chunk.choices[0].delta.content
+                reply_parts.append(content)
+                yield content
+        full_response = "".join(reply_parts)
+        # After streaming, add the full response to the conversation history
+        history.append({"role": "assistant", "content": full_response})
+        return full_response
     except Exception as e:
         raise HTTPException(status_code=500, detail=f"Error in model response: {str(e)}")
         raise HTTPException(status_code=403, detail="Could not validate credentials")
     return api_key
+# SQLite setup
+def init_db():
+    """Create the `conversations` table in conversations.db if it does not exist."""
+    conn = sqlite3.connect('conversations.db')
+    try:
+        conn.execute('''CREATE TABLE IF NOT EXISTS conversations
+                 (id INTEGER PRIMARY KEY AUTOINCREMENT,
+                  user_id TEXT,
+                  conversation_id TEXT,
+                  message TEXT,
+                  response TEXT,
+                  timestamp DATETIME DEFAULT CURRENT_TIMESTAMP)''')
+        conn.commit()
+    finally:
+        # Release the database handle even when the DDL or the commit raises.
+        conn.close()
+init_db()
+def update_db(user_id, conversation_id, message, response):
+    """Insert one user message / assistant response pair into conversations.db.
+
+    The values are bound as SQL parameters; the `id` and `timestamp` columns
+    are left to their table defaults.
+    """
+    conn = sqlite3.connect('conversations.db')
+    try:
+        conn.execute('''INSERT INTO conversations (user_id, conversation_id, message, response)
+                 VALUES (?, ?, ?, ?)''', (user_id, conversation_id, message, response))
+        conn.commit()
+    finally:
+        # Release the database handle even when the insert or the commit raises.
+        conn.close()
+async def clear_inactive_conversations():
+    """Loop forever, dropping conversations that have been idle too long.
+
+    On every pass, each conversation whose `last_activity` timestamp is more
+    than 1800 seconds old is removed from both `conversations` and
+    `last_activity`; the coroutine then sleeps for 60 seconds.
+    """
+    while True:
+        now = time.time()
+        # Collect the ids first so the dict is not mutated while iterating it.
+        expired = []
+        for conv_id, last_time in last_activity.items():
+            if now - last_time > 1800:  # 30 minutes
+                expired.append(conv_id)
+        for conv_id in expired:
+            conversations.pop(conv_id, None)
+            last_activity.pop(conv_id, None)
+        await asyncio.sleep(60)  # Check every minute
+@app.on_event("startup")
+async def startup_event():
+    """Start the background loop that purges inactive conversations."""
+    # The event loop keeps only weak references to tasks, so an unreferenced
+    # task can be garbage-collected mid-run; hold a strong reference on the app.
+    app.state.cleanup_task = asyncio.create_task(clear_inactive_conversations())
 @app.post("/coding-assistant")
+async def coding_assistant(query: QueryModel, background_tasks: BackgroundTasks, api_key: str = Depends(verify_api_key)):
     """
     Coding assistant endpoint that provides programming help based on user queries.
     Requires API Key authentication via X-API-Key header.
     """
+    # First request for this conversation id: seed the history with the system prompt.
+    if query.conversation_id not in conversations:
+        conversations[query.conversation_id] = [
+            {"role": "system", "content": "You are a helpful assistant proficient in coding tasks. Help the user in understanding and writing code."}
+        ]
+    conversations[query.conversation_id].append({"role": "user", "content": query.user_query})
+    last_activity[query.conversation_id] = time.time()
+    # Limit tokens in the conversation history
+    # This is the stored list itself, not a copy, so messages popped below
+    # are removed from the saved conversation as well.
+    limited_conversation = conversations[query.conversation_id]
+    while calculate_tokens(limited_conversation) > 8000:
+        if len(limited_conversation) > 2:  # Keep at least the system message and the latest user message
+            # Index 1 is the oldest message after the system prompt.
+            limited_conversation.pop(1)
+        else:
+            error_message = "Token limit exceeded. Please shorten your input or start a new conversation."
+            raise HTTPException(status_code=400, detail=error_message)
+    def process_response():
+        # chat_with_llama_stream is a plain (synchronous) generator, so it has
+        # to be consumed with a regular `for`; `async for` over it raises
+        # TypeError. StreamingResponse accepts a synchronous iterator.
+        reply_parts = []
+        for content in chat_with_llama_stream(limited_conversation, model=query.model_id):
+            reply_parts.append(content)
+            yield content
+        # Queue the DB write only once the whole reply has been streamed.
+        full_response = "".join(reply_parts)
+        background_tasks.add_task(update_db, query.user_id, query.conversation_id, query.user_query, full_response)
+    return StreamingResponse(process_response(), media_type="text/event-stream")
 if __name__ == "__main__":
     import uvicorn