Add internet search feature
- app.py +95 -101
- requirements.txt +1 -0
app.py
CHANGED
@@ -1,10 +1,9 @@
 import streamlit as st
 from llama_cpp import Llama
 from huggingface_hub import hf_hub_download
-import os
-import gc
-import shutil
-import re
+import os, gc, shutil, re
+from itertools import islice
+from duckduckgo_search import DDGS  # latest class-based interface

 # ----- Custom CSS for pretty formatting of internal reasoning -----
 CUSTOM_CSS = """
@@ -33,8 +32,40 @@ st.markdown(CUSTOM_CSS, unsafe_allow_html=True)
 # ----- Set a threshold for required free storage (in bytes) -----
 REQUIRED_SPACE_BYTES = 5 * 1024 ** 3  # 5 GB

+# ----- Function to perform DuckDuckGo search and retrieve concise context -----
+def retrieve_context(query, max_results=2, max_chars_per_result=150):
+    """
+    Query DuckDuckGo for the given search query and return a concatenated context string.
+    Uses the DDGS().text() generator (with region, safesearch, and timelimit parameters)
+    and limits the results using islice. Each result's title and snippet are combined into context.
+    """
+    try:
+        with DDGS() as ddgs:
+            results_gen = ddgs.text(query, region="wt-wt", safesearch="off", timelimit="y")
+            results = list(islice(results_gen, max_results))
+        context = ""
+        if results:
+            for i, result in enumerate(results, start=1):
+                title = result.get("title", "No Title")
+                snippet = result.get("body", "")[:max_chars_per_result]
+                context += f"Result {i}:\nTitle: {title}\nSnippet: {snippet}\n\n"
+        return context.strip()
+    except Exception as e:
+        st.error(f"Error during retrieval: {e}")
+        return ""
+
 # ----- Available models -----
 MODELS = {
+    "Qwen2.5-0.5B-Instruct (Q4_K_M)": {
+        "repo_id": "Qwen/Qwen2.5-0.5B-Instruct-GGUF",
+        "filename": "qwen2.5-0.5b-instruct-q4_k_m.gguf",
+        "description": "Qwen2.5-0.5B-Instruct (Q4_K_M)"
+    },
+    "Gemma-3.1B-it (Q4_K_M)": {
+        "repo_id": "unsloth/gemma-3-1b-it-GGUF",
+        "filename": "gemma-3-1b-it-Q4_K_M.gguf",
+        "description": "Gemma-3.1B-it (Q4_K_M)"
+    },
     "Qwen2.5-7B-Instruct (Q2_K)": {
         "repo_id": "Qwen/Qwen2.5-7B-Instruct-GGUF",
         "filename": "qwen2.5-7b-instruct-q2_k.gguf",
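
Note: the new retrieve_context() helper can be sanity-checked outside Streamlit. This is a minimal sketch, assuming the duckduckgo_search package is installed and network access is available; the query string is only an example.

from itertools import islice
from duckduckgo_search import DDGS

# Same DDGS().text() call pattern that retrieve_context() uses.
query = "llama.cpp GGUF quantization"  # example query
with DDGS() as ddgs:
    results = list(islice(ddgs.text(query, region="wt-wt", safesearch="off", timelimit="y"), 2))

for i, r in enumerate(results, start=1):
    # Each result dict carries "title", "href", and "body" keys.
    print(f"Result {i}: {r.get('title', 'No Title')}")
    print(r.get("body", "")[:150], "\n")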
@@ -76,22 +107,16 @@ MODELS = {
 with st.sidebar:
     st.header("⚙️ Settings")
     selected_model_name = st.selectbox("Select Model", list(MODELS.keys()))
-
-    max_tokens = st.slider("Max tokens", 64,
+    system_prompt_base = st.text_area("System Prompt", value="You are a helpful assistant.", height=80)
+    max_tokens = st.slider("Max tokens", 64, 1024, 256, step=32)  # Adjust for lower memory usage
     temperature = st.slider("Temperature", 0.1, 2.0, 0.7)
     top_k = st.slider("Top-K", 1, 100, 40)
     top_p = st.slider("Top-P", 0.1, 1.0, 0.95)
     repeat_penalty = st.slider("Repetition Penalty", 1.0, 2.0, 1.1)

-
-
-
-                if f.endswith(".gguf"):
-                    os.remove(os.path.join("models", f))
-            st.success("Model cache cleared.")
-        except Exception as e:
-            st.error(f"Failed to clear models: {e}")
-
+    # Checkbox to enable the DuckDuckGo search feature (disabled by default)
+    enable_search = st.checkbox("Enable Web Search", value=False)
+
     if st.button("📦 Show Disk Usage"):
         try:
             usage = shutil.disk_usage(".")
@@ -101,49 +126,22 @@ with st.sidebar:
     except Exception as e:
         st.error(f"Disk usage error: {e}")

-# -----
+# ----- Define selected model and path -----
 selected_model = MODELS[selected_model_name]
 model_path = os.path.join("models", selected_model["filename"])

-#
-if "model_name" not in st.session_state:
-    st.session_state.model_name = None
-if "llm" not in st.session_state:
-    st.session_state.llm = None
-if "chat_history" not in st.session_state:
-    st.session_state.chat_history = []
-if "pending_response" not in st.session_state:
-    st.session_state.pending_response = False
-
-# ----- Ensure model directory exists -----
+# Ensure model directory exists
 os.makedirs("models", exist_ok=True)

-# -----
-def cleanup_old_models():
-    for f in os.listdir("models"):
-        if f.endswith(".gguf") and f != selected_model["filename"]:
-            try:
-                os.remove(os.path.join("models", f))
-            except Exception as e:
-                st.warning(f"Couldn't delete old model {f}: {e}")
-
-def download_model():
-    with st.spinner(f"Downloading {selected_model['filename']}..."):
-        hf_hub_download(
-            repo_id=selected_model["repo_id"],
-            filename=selected_model["filename"],
-            local_dir="./models",
-            local_dir_use_symlinks=False,  # Deprecated parameter; harmless warning.
-        )
-
+# ----- Helper functions for model management -----
 def try_load_model(path):
     try:
         return Llama(
             model_path=path,
-            n_ctx=
-            n_threads=
-            n_threads_batch=
-            n_batch=
+            n_ctx=512,  # Reduced context window to save memory
+            n_threads=1,  # Fewer threads for resource-constrained environments
+            n_threads_batch=1,
+            n_batch=2,  # Lower batch size to conserve memory
             n_gpu_layers=0,
             use_mlock=False,
             use_mmap=True,
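
Note: the loader now trades throughput for memory so the app fits a small CPU Space. Below is a standalone sketch of the same llama-cpp-python settings; the model path is only an example and assumes the GGUF file has already been downloaded into ./models.

from llama_cpp import Llama

llm = Llama(
    model_path="models/qwen2.5-0.5b-instruct-q4_k_m.gguf",  # example path
    n_ctx=512,          # small context window keeps the KV cache small
    n_threads=1,        # single generation thread on a shared CPU
    n_threads_batch=1,
    n_batch=2,          # prompt processed in very small batches
    n_gpu_layers=0,     # CPU only
    use_mlock=False,
    use_mmap=True,
)

out = llm.create_chat_completion(
    messages=[{"role": "user", "content": "Say hello in one sentence."}],
    max_tokens=32,
)
print(out["choices"][0]["message"]["content"])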
@@ -152,15 +150,21 @@ def try_load_model(path):
     except Exception as e:
         return str(e)

+def download_model():
+    with st.spinner(f"Downloading {selected_model['filename']}..."):
+        hf_hub_download(
+            repo_id=selected_model["repo_id"],
+            filename=selected_model["filename"],
+            local_dir="./models",
+            local_dir_use_symlinks=False,
+        )
+
 def validate_or_download_model():
-    # Download model if not present locally.
     if not os.path.exists(model_path):
         free_space = shutil.disk_usage(".").free
         if free_space < REQUIRED_SPACE_BYTES:
-            st.info("Insufficient storage
-            cleanup_old_models()
+            st.info("Insufficient storage. Consider cleaning up old models.")
         download_model()
-
     result = try_load_model(model_path)
     if isinstance(result, str):
         st.warning(f"Initial load failed: {result}\nAttempting re-download...")
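
Note: the removed version of download_model() carried a comment that local_dir_use_symlinks is deprecated; recent huggingface_hub releases ignore the flag and copy files into local_dir, so the call also works without it. A minimal sketch using one of the models added above:

from huggingface_hub import hf_hub_download

# Downloads the file into ./models and returns its local path.
path = hf_hub_download(
    repo_id="Qwen/Qwen2.5-0.5B-Instruct-GGUF",
    filename="qwen2.5-0.5b-instruct-q4_k_m.gguf",
    local_dir="./models",
)
print(path)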
@@ -168,10 +172,6 @@ def validate_or_download_model():
             os.remove(model_path)
         except Exception:
             pass
-        free_space = shutil.disk_usage(".").free
-        if free_space < REQUIRED_SPACE_BYTES:
-            st.info("Insufficient storage detected on re-download attempt. Cleaning up old models to free up space.")
-            cleanup_old_models()
         download_model()
         result = try_load_model(model_path)
         if isinstance(result, str):
@@ -180,6 +180,16 @@ def validate_or_download_model():
         return result
     return result

+# ----- Session state initialization -----
+if "model_name" not in st.session_state:
+    st.session_state.model_name = None
+if "llm" not in st.session_state:
+    st.session_state.llm = None
+if "chat_history" not in st.session_state:
+    st.session_state.chat_history = []
+if "pending_response" not in st.session_state:
+    st.session_state.pending_response = False
+
 # ----- Load model if changed -----
 if st.session_state.model_name != selected_model_name:
     if st.session_state.llm is not None:
@@ -194,37 +204,51 @@ llm = st.session_state.llm
 st.title(f"🧠 {selected_model['description']} (Streamlit + GGUF)")
 st.caption(f"Powered by `llama.cpp` | Model: {selected_model['filename']}")

-#
+# Render existing chat history
 for chat in st.session_state.chat_history:
     with st.chat_message(chat["role"]):
         st.markdown(chat["content"])
-        # For assistant messages, if there's completed internal reasoning, display it behind an expander.
-        if chat.get("role") == "assistant" and chat.get("thinking"):
-            with st.expander("🧠 Model's Internal Reasoning"):
-                for t in chat["thinking"]:
-                    st.markdown(t.strip())

-# ----- Chat input
+# ----- Chat input and integrated RAG with memory optimizations -----
 user_input = st.chat_input("Ask something...")

 if user_input:
     if st.session_state.pending_response:
         st.warning("Please wait for the assistant to finish responding.")
     else:
+        # Append the user query to chat history
         st.session_state.chat_history.append({"role": "user", "content": user_input})
         with st.chat_message("user"):
             st.markdown(user_input)

         st.session_state.pending_response = True

-
+        # Only retrieve search context if search feature is enabled
+        if enable_search:
+            retrieved_context = retrieve_context(user_input, max_results=2, max_chars_per_result=150)
+        else:
+            retrieved_context = ""
+        st.sidebar.markdown("### Retrieved Context" if enable_search else "Web Search Disabled")
+        st.sidebar.text(retrieved_context or "No context found.")
+
+        # Build an augmented system prompt that includes the retrieved context if available
+        if retrieved_context:
+            augmented_prompt = (
+                "Use the following recent web search context to help answer the query:\n\n"
+                f"{retrieved_context}\n\nUser Query: {user_input}"
+            )
+        else:
+            augmented_prompt = f"User Query: {user_input}"
+        full_system_prompt = system_prompt_base.strip() + "\n\n" + augmented_prompt
+
+        # Limit conversation history to the last 2 turns
+        MAX_TURNS = 2
         trimmed_history = st.session_state.chat_history[-(MAX_TURNS * 2):]
-        messages = [{"role": "system", "content":
+        messages = [{"role": "system", "content": full_system_prompt}] + trimmed_history

-        #
+        # Generate response with the LLM in a streaming fashion
         with st.chat_message("assistant"):
             visible_placeholder = st.empty()
-            thinking_placeholder = st.empty()
             full_response = ""
             stream = llm.create_chat_completion(
                 messages=messages,
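
Note: to make the prompt assembly concrete, this is roughly what `messages` looks like for a single turn with web search enabled. The context and query strings are made-up examples, and the trimmed history here contains only the current user message.

retrieved_context = (
    "Result 1:\nTitle: Example page\nSnippet: ...\n\n"
    "Result 2:\nTitle: Another page\nSnippet: ..."
)
user_input = "What is new in the latest release?"

full_system_prompt = (
    "You are a helpful assistant.\n\n"
    "Use the following recent web search context to help answer the query:\n\n"
    f"{retrieved_context}\n\nUser Query: {user_input}"
)

messages = [
    {"role": "system", "content": full_system_prompt},
    {"role": "user", "content": user_input},
]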
@@ -240,41 +264,11 @@ if user_input:
                 if "choices" in chunk:
                     delta = chunk["choices"][0]["delta"].get("content", "")
                     full_response += delta
-
-                    # Determine if there is an open (in-progress) <think> block
-                    open_think = re.search(r"<think>([^<]*)$", full_response, flags=re.DOTALL)
-                    in_progress = open_think.group(1).strip() if open_think else ""
-
-                    # Create the visible response by removing any complete <think>...</think> blocks,
-                    # and also removing any in-progress (unclosed) <think> content.
+                    # Clean internal reasoning markers before display
                     visible_response = re.sub(r"<think>.*?</think>", "", full_response, flags=re.DOTALL)
                     visible_response = re.sub(r"<think>.*$", "", visible_response, flags=re.DOTALL)
                     visible_placeholder.markdown(visible_response)

-
-                    if in_progress:
-                        # You can further format in_progress as you like; here we wrap it in a styled div.
-                        thinking_html = f"""
-                        <div class="chat-assistant">
-                            <strong>Internal Reasoning (in progress):</strong>
-                            <br>{in_progress}
-                        </div>
-                        """
-                        thinking_placeholder.markdown(thinking_html, unsafe_allow_html=True)
-                    else:
-                        thinking_placeholder.empty()
-
-        # After streaming completes:
-        # Extract all completed <think> blocks (the final internal reasoning that was closed)
-        final_thinking = re.findall(r"<think>(.*?)</think>", full_response, flags=re.DOTALL)
-        # The final visible response: remove any <think> blocks or any in-progress open block.
-        final_visible = re.sub(r"<think>.*?</think>", "", full_response, flags=re.DOTALL)
-        final_visible = re.sub(r"<think>.*$", "", final_visible, flags=re.DOTALL)
-
-        st.session_state.chat_history.append({
-            "role": "assistant",
-            "content": final_visible,
-            "thinking": final_thinking
-        })
-
+        st.session_state.chat_history.append({"role": "assistant", "content": full_response})
         st.session_state.pending_response = False
+        gc.collect()  # Trigger garbage collection to free memory
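
Note: the streaming loop keeps the UI clean by stripping <think> spans before rendering. The two substitutions behave like this on a sample partial response (the strings are made up); also note that the raw full_response, including any reasoning markers, is what now gets stored in chat history.

import re

full_response = "<think>step 1 reasoning</think>Here is the visible answer <think>still thin"

visible = re.sub(r"<think>.*?</think>", "", full_response, flags=re.DOTALL)  # drop closed blocks
visible = re.sub(r"<think>.*$", "", visible, flags=re.DOTALL)                # drop an unclosed tail
print(visible)  # -> "Here is the visible answer "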
requirements.txt
CHANGED

@@ -5,3 +5,4 @@ llama-cpp-python
 llama-cpp-agent
 huggingface_hub
 streamlit
+duckduckgo_search