UI/UX Improvement
app.py
CHANGED
@@ -6,7 +6,9 @@ from llama_cpp.llama_speculative import LlamaPromptLookupDecoding
 from huggingface_hub import hf_hub_download
 from duckduckgo_search import DDGS
 
-#
+# ------------------------------
+# Initialize Session State
+# ------------------------------
 if "chat_history" not in st.session_state:
     st.session_state.chat_history = []
 if "pending_response" not in st.session_state:
@@ -16,34 +18,24 @@ if "model_name" not in st.session_state:
 if "llm" not in st.session_state:
     st.session_state.llm = None
 
-#
+# ------------------------------
+# Custom CSS for Improved Look & Feel
+# ------------------------------
 st.markdown("""
 <style>
-
-
-.chat-
+.chat-container { margin: 1em 0; }
+.chat-assistant { background-color: #eef7ff; padding: 1em; border-radius: 10px; margin-bottom: 1em; }
+.chat-user { background-color: #e6ffe6; padding: 1em; border-radius: 10px; margin-bottom: 1em; }
+.message-time { font-size: 0.8em; color: #555; text-align: right; }
+.loading-spinner { font-size: 1.1em; color: #ff6600; }
 </style>
 """, unsafe_allow_html=True)
 
-#
+# ------------------------------
+# Required Storage and Model Definitions
+# ------------------------------
 REQUIRED_SPACE_BYTES = 5 * 1024 ** 3  # 5 GB
 
-# ---- Function to retrieve web search context ----
-def retrieve_context(query, max_results=6, max_chars_per_result=600):
-    try:
-        with DDGS() as ddgs:
-            results = list(islice(ddgs.text(query, region="wt-wt", safesearch="off", timelimit="y"), max_results))
-            context = ""
-            for i, result in enumerate(results, start=1):
-                title = result.get("title", "No Title")
-                snippet = result.get("body", "")[:max_chars_per_result]
-                context += f"Result {i}:\nTitle: {title}\nSnippet: {snippet}\n\n"
-            return context.strip()
-    except Exception as e:
-        st.error(f"Error during retrieval: {e}")
-        return ""
-
-# ---- Model definitions ----
 MODELS = {
     "Qwen2.5-0.5B-Instruct (Q4_K_M)": {
         "repo_id": "Qwen/Qwen2.5-0.5B-Instruct-GGUF",
@@ -102,33 +94,30 @@ MODELS = {
     },
 }
 
-#
-#
-
-
-
-
-
-
-
-
-
-
-
-
-
-# ---- Define selected model and manage its download/load ----
-selected_model = MODELS[selected_model_name]
-model_path = os.path.join("models", selected_model["filename"])
-os.makedirs("models", exist_ok=True)
+# ------------------------------
+# Helper Functions
+# ------------------------------
+def retrieve_context(query, max_results=6, max_chars_per_result=600):
+    """Retrieve web search context using DuckDuckGo."""
+    try:
+        with DDGS() as ddgs:
+            results = list(islice(ddgs.text(query, region="wt-wt", safesearch="off", timelimit="y"), max_results))
+            context = ""
+            for i, result in enumerate(results, start=1):
+                title = result.get("title", "No Title")
+                snippet = result.get("body", "")[:max_chars_per_result]
+                context += f"Result {i}:\nTitle: {title}\nSnippet: {snippet}\n\n"
+            return context.strip()
+    except Exception as e:
+        st.error(f"Error during web retrieval: {e}")
+        return ""
 
-def try_load_model(
+def try_load_model(model_path):
+    """Attempt to initialize the model from a specified path."""
     try:
         return Llama(
-            model_path=
-            n_ctx=4096,
+            model_path=model_path,
+            n_ctx=4096,
             n_threads=2,
             n_threads_batch=1,
             n_batch=256,
@@ -142,7 +131,8 @@ def try_load_model(path):
     except Exception as e:
         return str(e)
 
-def download_model():
+def download_model(selected_model):
+    """Download the model using Hugging Face Hub."""
     with st.spinner(f"Downloading {selected_model['filename']}..."):
         hf_hub_download(
             repo_id=selected_model["repo_id"],
@@ -151,63 +141,142 @@ def download_model():
             local_dir_use_symlinks=False,
         )
 
-def validate_or_download_model():
+def validate_or_download_model(selected_model):
+    """Ensure the model is available and loaded properly; download if necessary."""
+    model_path = os.path.join("models", selected_model["filename"])
+    os.makedirs("models", exist_ok=True)
     if not os.path.exists(model_path):
         if shutil.disk_usage(".").free < REQUIRED_SPACE_BYTES:
-            st.info("Insufficient storage. Consider cleaning up old models.")
-        download_model()
+            st.info("Insufficient storage space. Consider cleaning up old models.")
+        download_model(selected_model)
     result = try_load_model(model_path)
     if isinstance(result, str):
-        st.warning(f"Initial load failed: {result}\
+        st.warning(f"Initial model load failed: {result}\nAttempting re-download...")
         try:
             os.remove(model_path)
         except Exception:
             pass
-        download_model()
+        download_model(selected_model)
         result = try_load_model(model_path)
         if isinstance(result, str):
-            st.error(f"Model
+            st.error(f"Model failed to load after re-download: {result}")
            st.stop()
     return result
 
+def stream_response(llm, messages, max_tokens, temperature, top_k, top_p, repeat_penalty, response_queue):
+    """Stream the model response token-by-token."""
+    final_text = ""
+    try:
+        stream = llm.create_chat_completion(
+            messages=messages,
+            max_tokens=max_tokens,
+            temperature=temperature,
+            top_k=top_k,
+            top_p=top_p,
+            repeat_penalty=repeat_penalty,
+            stream=True,
+        )
+        for chunk in stream:
+            if "choices" in chunk:
+                delta = chunk["choices"][0]["delta"].get("content", "")
+                final_text += delta
+                response_queue.put(delta)
+                if chunk["choices"][0].get("finish_reason", ""):
+                    break
+    except Exception as e:
+        response_queue.put(f"\nError: {e}")
+    response_queue.put(None)  # Signal the end of streaming
+
+# ------------------------------
+# Sidebar: Settings and Advanced Options
+# ------------------------------
+with st.sidebar:
+    st.header("⚙️ Settings")
+
+    # Basic Settings
+    selected_model_name = st.selectbox("Select Model", list(MODELS.keys()),
+                                       help="Choose from the available model configurations.")
+    system_prompt_base = st.text_area("System Prompt",
+                                      value="You are a helpful assistant.",
+                                      height=80,
+                                      help="Define the base context for the AI's responses.")
+
+    # Generation Parameters
+    st.subheader("Generation Parameters")
+    max_tokens = st.slider("Max Tokens", 64, 1024, 256, step=32,
+                           help="The maximum number of tokens the assistant can generate.")
+    temperature = st.slider("Temperature", 0.1, 2.0, 0.7,
+                            help="Controls randomness. Lower values are more deterministic.")
+    top_k = st.slider("Top-K", 1, 100, 40,
+                      help="Limits the token candidates to the top-k tokens.")
+    top_p = st.slider("Top-P", 0.1, 1.0, 0.95,
+                      help="Nucleus sampling parameter; restricts to a cumulative probability.")
+    repeat_penalty = st.slider("Repetition Penalty", 1.0, 2.0, 1.1,
+                               help="Penalizes token repetition to improve output variety.")
+
+    # Advanced Settings using expandable sections
+    with st.expander("Web Search Settings"):
+        enable_search = st.checkbox("Enable Web Search", value=False,
+                                    help="Include recent web search context to augment the prompt.")
+        max_results = st.number_input("Max Results for Context", min_value=1, max_value=20, value=6, step=1,
+                                      help="How many search results to use.")
+        max_chars_per_result = st.number_input("Max Chars per Result", min_value=100, max_value=2000, value=600, step=50,
+                                               help="Max characters to extract from each search result.")
+
+# ------------------------------
+# Model Loading/Reloading if Needed
+# ------------------------------
+selected_model = MODELS[selected_model_name]
 if st.session_state.model_name != selected_model_name:
-
-
-
-
-
+    with st.spinner("Loading selected model..."):
+        if st.session_state.llm is not None:
+            del st.session_state.llm
+            gc.collect()
+        st.session_state.llm = validate_or_download_model(selected_model)
+        st.session_state.model_name = selected_model_name
 
 llm = st.session_state.llm
 
-#
+# ------------------------------
+# Main Title and Chat History Display
+# ------------------------------
 st.title(f"🧠 {selected_model['description']} (Streamlit + GGUF)")
 st.caption(f"Powered by `llama.cpp` | Model: {selected_model['filename']}")
 
+# Render chat history with improved styling
 for chat in st.session_state.chat_history:
-
-
+    role = chat["role"]
+    content = chat["content"]
+    if role == "assistant":
+        st.markdown(f"<div class='chat-assistant'>{content}</div>", unsafe_allow_html=True)
+    else:
+        st.markdown(f"<div class='chat-user'>{content}</div>", unsafe_allow_html=True)
 
-#
-
+# ------------------------------
+# Chat Input and Processing
+# ------------------------------
+user_input = st.chat_input("Your message...")
 if user_input:
     if st.session_state.pending_response:
-        st.warning("Please wait
+        st.warning("Please wait until the current response is finished.")
     else:
-        #
+        # Append user message with timestamp (if desired)
+        timestamp = time.strftime("%H:%M")
+        st.session_state.chat_history.append({"role": "user", "content": f"{user_input}\n\n<span class='message-time'>{timestamp}</span>"})
         with st.chat_message("user"):
-            st.markdown(user_input)
-
+            st.markdown(f"<div class='chat-user'>{user_input}</div>", unsafe_allow_html=True)
+
         st.session_state.pending_response = True
+
+        # Retrieve web search context if enabled
+        retrieved_context = ""
+        if enable_search:
+            retrieved_context = retrieve_context(user_input, max_results=max_results, max_chars_per_result=max_chars_per_result)
+            with st.sidebar:
+                st.markdown("### Retrieved Context")
+                st.text_area("", value=retrieved_context or "No context found.", height=150)
 
-        #
-        retrieved_context = (
-            retrieve_context(user_input, max_results=max_results, max_chars_per_result=max_chars_per_result)
-            if enable_search else ""
-        )
-        st.sidebar.markdown("### Retrieved Context" if enable_search else "Web Search Disabled")
-        st.sidebar.text(retrieved_context or "No context found.")
-
-        # Build augmented query as before...
+        # Augment the user prompt with the system prompt and optional web context
         if enable_search and retrieved_context:
             augmented_user_input = (
                 f"{system_prompt_base.strip()}\n\n"
@@ -217,8 +286,8 @@ if user_input:
             )
         else:
             augmented_user_input = f"{system_prompt_base.strip()}\n\nUser Query: {user_input}"
-
-        # Limit conversation history
+
+        # Limit conversation history to the last few turns (for context)
         MAX_TURNS = 2
         trimmed_history = st.session_state.chat_history[-(MAX_TURNS * 2):]
         if trimmed_history and trimmed_history[-1]["role"] == "user":
@@ -226,62 +295,44 @@ if user_input:
         else:
             messages = trimmed_history + [{"role": "user", "content": augmented_user_input}]
 
-        #
+        # Set up a placeholder for displaying the streaming response and a queue for tokens
         visible_placeholder = st.empty()
+        progress_bar = st.progress(0)
         response_queue = queue.Queue()
 
-        #
-        def stream_response(msgs, max_tokens, temp, topk, topp, repeat_penalty):
-            final_text = ""
-            try:
-                stream = llm.create_chat_completion(
-                    messages=msgs,
-                    max_tokens=max_tokens,
-                    temperature=temp,
-                    top_k=topk,
-                    top_p=topp,
-                    repeat_penalty=repeat_penalty,
-                    stream=True,
-                )
-                for chunk in stream:
-                    if "choices" in chunk:
-                        delta = chunk["choices"][0]["delta"].get("content", "")
-                        final_text += delta
-                        response_queue.put(delta)
-                        if chunk["choices"][0].get("finish_reason", ""):
-                            break
-            except Exception as e:
-                response_queue.put(f"\nError: {e}")
-            response_queue.put(None)  # Signal completion
-
-        # Start streaming in a separate thread
+        # Start streaming response in a separate thread
         stream_thread = threading.Thread(
             target=stream_response,
-            args=(messages, max_tokens, temperature, top_k, top_p, repeat_penalty),
            daemon=True
+            args=(llm, messages, max_tokens, temperature, top_k, top_p, repeat_penalty, response_queue),
         )
         stream_thread.start()
 
-        # Poll the queue
+        # Poll the queue to update the UI with incremental tokens and update progress
         final_response = ""
         timeout = 300  # seconds
         start_time = time.time()
+        progress = 0
         while True:
            try:
                update = response_queue.get(timeout=0.1)
                if update is None:
                    break
                final_response += update
+                # Remove any special tags from the output (for cleaner UI)
                visible_response = re.sub(r"<think>.*?</think>", "", final_response, flags=re.DOTALL)
-
-
-
+                visible_placeholder.markdown(f"<div class='chat-assistant'>{visible_response}</div>", unsafe_allow_html=True)
+                progress = min(progress + 1, 100)
+                progress_bar.progress(progress)
                start_time = time.time()
            except queue.Empty:
                if time.time() - start_time > timeout:
                    st.error("Response generation timed out.")
                    break
 
-
+        # Append assistant response with timestamp
+        timestamp = time.strftime("%H:%M")
+        st.session_state.chat_history.append({"role": "assistant", "content": f"{final_response}\n\n<span class='message-time'>{timestamp}</span>"})
         st.session_state.pending_response = False
+        progress_bar.empty()  # Clear progress bar
         gc.collect()
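
Note on the streaming design: the new stream_response helper runs generation on a background thread and hands tokens back through a queue.Queue, while the main script polls with a 0.1 s timeout and resets a 300 s inactivity timer whenever a token arrives. The standalone sketch below reproduces that producer/consumer pattern with a dummy token generator standing in for llm.create_chat_completion(..., stream=True), so it runs without llama.cpp or Streamlit; fake_token_stream and the 5-second timeout are illustrative only, not part of the app.

import queue
import threading
import time

def fake_token_stream():
    # Stand-in for the llama.cpp streaming call used in app.py.
    for token in ["Hello", ",", " ", "streaming", " ", "world", "!"]:
        time.sleep(0.05)
        yield token

def stream_worker(response_queue):
    try:
        for delta in fake_token_stream():
            response_queue.put(delta)
    except Exception as e:
        response_queue.put(f"\nError: {e}")
    response_queue.put(None)  # Sentinel: end of stream.

response_queue = queue.Queue()
threading.Thread(target=stream_worker, args=(response_queue,), daemon=True).start()

final_response = ""
timeout = 5  # Seconds of inactivity before giving up (300 in the app).
start_time = time.time()
while True:
    try:
        update = response_queue.get(timeout=0.1)
        if update is None:
            break
        final_response += update
        start_time = time.time()  # Reset the inactivity timer on progress.
    except queue.Empty:
        if time.time() - start_time > timeout:
            print("Response generation timed out.")
            break

print(final_response)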
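
Note on the retrieval helper: retrieve_context truncates each DuckDuckGo hit to max_chars_per_result characters and concatenates numbered "Result N / Title / Snippet" blocks. The sketch below applies the same formatting step to a hard-coded result list, so it runs without network access or the duckduckgo_search package; the sample results are invented.

max_chars_per_result = 600

# Stand-ins for the dictionaries DDGS().text(...) yields in app.py.
results = [
    {"title": "Example page", "body": "Some snippet text about the query. " * 20},
    {"title": "Another page", "body": "More snippet text about the query. " * 20},
]

context = ""
for i, result in enumerate(results, start=1):
    title = result.get("title", "No Title")
    snippet = result.get("body", "")[:max_chars_per_result]
    context += f"Result {i}:\nTitle: {title}\nSnippet: {snippet}\n\n"

print(context.strip())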