bugfix

app.py CHANGED

@@ -6,6 +6,10 @@ import gc
 import shutil
 import re
 
+# Set a threshold for required free storage (in bytes) before downloading a new model.
+# Adjust this value according to the expected size of your models.
+REQUIRED_SPACE_BYTES = 5 * 1024 ** 3  # 5 GB
+
 # Available models
 MODELS = {
     "Qwen2.5-7B-Instruct (Q2_K)": {

@@ -68,8 +72,8 @@ with st.sidebar:
     if st.button("📦 Show Disk Usage"):
         try:
             usage = shutil.disk_usage(".")
-            used = usage.used / (1024**3)
-            free = usage.free / (1024**3)
+            used = usage.used / (1024 ** 3)
+            free = usage.free / (1024 ** 3)
             st.info(f"Disk Used: {used:.2f} GB | Free: {free:.2f} GB")
         except Exception as e:
             st.error(f"Disk usage error: {e}")

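For context on the conversion above: shutil.disk_usage(".") returns a named tuple of total, used, and free space in bytes, which is why the values are divided by 1024 ** 3 to display gigabytes. A minimal standalone sketch of the same check, reusing the commit's 5 GB threshold (the script below is illustrative only and not part of app.py):

import shutil

# Query the filesystem that contains the current working directory.
usage = shutil.disk_usage(".")  # named tuple: (total, used, free), all in bytes
used_gb = usage.used / (1024 ** 3)
free_gb = usage.free / (1024 ** 3)
print(f"Disk Used: {used_gb:.2f} GB | Free: {free_gb:.2f} GB")

# Same comparison the app performs before downloading a model.
REQUIRED_SPACE_BYTES = 5 * 1024 ** 3
if usage.free < REQUIRED_SPACE_BYTES:
    print("Less than 5 GB free; a cleanup would be triggered here.")
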
@@ -78,11 +82,15 @@ with st.sidebar:
 selected_model = MODELS[selected_model_name]
 model_path = os.path.join("models", selected_model["filename"])
 
-#
+# Initialize session state variables if not present
 if "model_name" not in st.session_state:
     st.session_state.model_name = None
 if "llm" not in st.session_state:
     st.session_state.llm = None
+if "chat_history" not in st.session_state:
+    st.session_state.chat_history = []
+if "pending_response" not in st.session_state:
+    st.session_state.pending_response = False
 
 # Ensure model directory exists
 os.makedirs("models", exist_ok=True)

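Because Streamlit re-runs the whole script on every interaction, unguarded assignments would wipe chat_history and pending_response on each rerun; the "not in st.session_state" guards above make each value initialize exactly once per session. The same pattern can be factored into a tiny helper, shown here as an illustrative sketch only (init_state is not part of app.py):

import streamlit as st

def init_state(key, default):
    # Only set the value the first time the script runs in this session.
    if key not in st.session_state:
        st.session_state[key] = default

init_state("model_name", None)
init_state("llm", None)
init_state("chat_history", [])
init_state("pending_response", False)
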
@@ -107,13 +115,28 @@ def download_model():
 
 def try_load_model(path):
     try:
-        return Llama(
+        return Llama(
+            model_path=path,
+            n_ctx=1024,
+            n_threads=2,
+            n_threads_batch=2,
+            n_batch=4,
+            n_gpu_layers=0,
+            use_mlock=False,
+            use_mmap=True,
+            verbose=False,
+        )
     except Exception as e:
         return str(e)
 
 def validate_or_download_model():
+    # Download model if it doesn't exist locally.
     if not os.path.exists(model_path):
-
+        # Check free space and cleanup old models only if free space is insufficient.
+        free_space = shutil.disk_usage(".").free
+        if free_space < REQUIRED_SPACE_BYTES:
+            st.info("Insufficient storage detected. Cleaning up old models to free up space.")
+            cleanup_old_models()
         download_model()
 
     result = try_load_model(model_path)

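cleanup_old_models() and download_model() are defined elsewhere in app.py and do not appear in this diff. Purely as an assumption about what the cleanup step might involve, a helper in this spirit could delete previously downloaded GGUF files other than the currently selected one (this sketch is not the commit's actual implementation):

import os

def cleanup_old_models(models_dir="models", keep_filename=None):
    # Remove previously downloaded .gguf files to free disk space,
    # optionally keeping the file for the currently selected model.
    for name in os.listdir(models_dir):
        if name.endswith(".gguf") and name != keep_filename:
            try:
                os.remove(os.path.join(models_dir, name))
            except OSError:
                pass  # ignore files that are locked or already gone
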
@@ -121,9 +144,13 @@ def validate_or_download_model():
         st.warning(f"Initial load failed: {result}\nAttempting re-download...")
         try:
             os.remove(model_path)
-        except:
+        except Exception:
             pass
-
+        # Check storage again before re-downloading.
+        free_space = shutil.disk_usage(".").free
+        if free_space < REQUIRED_SPACE_BYTES:
+            st.info("Insufficient storage detected on re-download attempt. Cleaning up old models to free up space.")
+            cleanup_old_models()
         download_model()
         result = try_load_model(model_path)
         if isinstance(result, str):

@@ -142,29 +169,46 @@ if st.session_state.model_name != selected_model_name:
 
 llm = st.session_state.llm
 
-#
-if "chat_history" not in st.session_state:
-    st.session_state.chat_history = []
-
+# Display title and caption
 st.title(f"🧠 {selected_model['description']} (Streamlit + GGUF)")
 st.caption(f"Powered by `llama.cpp` | Model: {selected_model['filename']}")
 
+# Render the full chat history
+for chat in st.session_state.chat_history:
+    with st.chat_message(chat["role"]):
+        st.markdown(chat["content"])
+        # For assistant messages, if there's internal reasoning, display it behind an expander
+        if chat.get("role") == "assistant" and chat.get("thinking"):
+            with st.expander("🧠 Model's Internal Reasoning"):
+                for t in chat["thinking"]:
+                    st.markdown(t.strip())
+
+# Chat input widget
 user_input = st.chat_input("Ask something...")
 
 if user_input:
-    if
-
+    # Block new input if a response is still pending
+    if st.session_state.pending_response:
+        st.warning("Please wait for the assistant to finish responding.")
     else:
+        # Append and render the user's message
        st.session_state.chat_history.append({"role": "user", "content": user_input})
-
        with st.chat_message("user"):
            st.markdown(user_input)
 
+        # Mark that we are waiting for a response
+        st.session_state.pending_response = True
+
        MAX_TURNS = 8
-
+        # Use the latest MAX_TURNS * 2 messages (system prompt plus conversation)
+        trimmed_history = st.session_state.chat_history[-(MAX_TURNS * 2):]
        messages = [{"role": "system", "content": system_prompt}] + trimmed_history
 
+        # Create a container for the assistant's streaming message with two placeholders:
+        # one for visible output and one for the think part.
        with st.chat_message("assistant"):
+            visible_placeholder = st.empty()
+            thinking_placeholder = st.empty()
            full_response = ""
            stream = llm.create_chat_completion(
                messages=messages,

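The slice st.session_state.chat_history[-(MAX_TURNS * 2):] keeps at most the last MAX_TURNS exchanges, since each turn contributes one user and one assistant message, so the prompt sent to the model stays bounded. A quick worked example with made-up messages:

MAX_TURNS = 8

# 20 alternating user/assistant messages, i.e. 10 full turns.
chat_history = [
    {"role": "user" if i % 2 == 0 else "assistant", "content": f"msg {i}"}
    for i in range(20)
]

trimmed_history = chat_history[-(MAX_TURNS * 2):]
print(len(trimmed_history))           # 16 -> the 8 most recent turns
print(trimmed_history[0]["content"])  # "msg 4" -> the two oldest turns were dropped
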
@@ -175,19 +219,34 @@ if user_input:
                repeat_penalty=repeat_penalty,
                stream=True,
            )
-
+            # Stream and update the assistant's message in real time
            for chunk in stream:
                if "choices" in chunk:
                    delta = chunk["choices"][0]["delta"].get("content", "")
                    full_response += delta
-
+                    # Update visible response by filtering out think parts
+                    visible_response = re.sub(r"<think>.*?</think>", "", full_response, flags=re.DOTALL)
+                    visible_placeholder.markdown(visible_response)
+                    # Extract and pretty format internal reasoning (if any) while streaming
+                    thinking = re.findall(r"<think>(.*?)</think>", full_response, flags=re.DOTALL)
+                    if thinking:
+                        thinking_display = "\n\n".join(f"- {t.strip()}" for t in thinking)
+                        thinking_placeholder.markdown(f"**Internal Reasoning (in progress):**\n\n{thinking_display}")
+                    else:
+                        thinking_placeholder.empty()
+            # After streaming completes, process the final full response:
            visible_response = re.sub(r"<think>.*?</think>", "", full_response, flags=re.DOTALL)
-            st.markdown(visible_response)
-
-            st.session_state.chat_history.append({"role": "assistant", "content": full_response})
-
            thinking = re.findall(r"<think>(.*?)</think>", full_response, flags=re.DOTALL)
+            st.session_state.chat_history.append({
+                "role": "assistant",
+                "content": visible_response,
+                "thinking": thinking
+            })
+            # Display the final internal reasoning behind an expander if available
            if thinking:
                with st.expander("🧠 Model's Internal Reasoning"):
                    for t in thinking:
                        st.markdown(t.strip())
+
+        # Clear the pending flag once done
+        st.session_state.pending_response = False

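The two regular expressions above split the model output into what the user sees and what goes behind the reasoning expander: re.sub strips every <think>...</think> block from the visible text, while re.findall collects the blocks' contents. A standalone demonstration on a made-up response string:

import re

full_response = (
    "<think>The user wants a short answer.</think>"
    "Paris is the capital of France."
    "<think>No follow-up needed.</think>"
)

# Hide the reasoning from the visible reply.
visible_response = re.sub(r"<think>.*?</think>", "", full_response, flags=re.DOTALL)
# Collect the reasoning blocks for the expander.
thinking = re.findall(r"<think>(.*?)</think>", full_response, flags=re.DOTALL)

print(visible_response)  # Paris is the capital of France.
print(thinking)          # ['The user wants a short answer.', 'No follow-up needed.']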