bugfix

app.py CHANGED

@@ -6,6 +6,10 @@ import gc
 import shutil
 import re
 
+# Set a threshold for required free storage (in bytes) before downloading a new model.
+# Adjust this value according to the expected size of your models.
+REQUIRED_SPACE_BYTES = 5 * 1024 ** 3  # 5 GB
+
 # Available models
 MODELS = {
     "Qwen2.5-7B-Instruct (Q2_K)": {

@@ -68,8 +72,8 @@ with st.sidebar:
     if st.button("📦 Show Disk Usage"):
         try:
             usage = shutil.disk_usage(".")
-            used = usage.used / (1024**3)
-            free = usage.free / (1024**3)
+            used = usage.used / (1024 ** 3)
+            free = usage.free / (1024 ** 3)
             st.info(f"Disk Used: {used:.2f} GB | Free: {free:.2f} GB")
         except Exception as e:
             st.error(f"Disk usage error: {e}")

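For context on the conversion above: shutil.disk_usage(".") returns a named tuple of total, used, and free space in bytes, which is why the values are divided by 1024 ** 3 to display gigabytes. A minimal standalone sketch of the same check, reusing the commit's 5 GB threshold (the script below is illustrative only and not part of app.py):

import shutil

# Query the filesystem that contains the current working directory.
usage = shutil.disk_usage(".")  # named tuple: (total, used, free), all in bytes
used_gb = usage.used / (1024 ** 3)
free_gb = usage.free / (1024 ** 3)
print(f"Disk Used: {used_gb:.2f} GB | Free: {free_gb:.2f} GB")

# Same comparison the app performs before downloading a model.
REQUIRED_SPACE_BYTES = 5 * 1024 ** 3
if usage.free < REQUIRED_SPACE_BYTES:
    print("Less than 5 GB free; a cleanup would be triggered here.")
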
@@ -78,11 +82,15 @@ with st.sidebar:
 selected_model = MODELS[selected_model_name]
 model_path = os.path.join("models", selected_model["filename"])
 
-#
+# Initialize session state variables if not present
 if "model_name" not in st.session_state:
     st.session_state.model_name = None
 if "llm" not in st.session_state:
     st.session_state.llm = None
+if "chat_history" not in st.session_state:
+    st.session_state.chat_history = []
+if "pending_response" not in st.session_state:
+    st.session_state.pending_response = False
 
 # Ensure model directory exists
 os.makedirs("models", exist_ok=True)

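Because Streamlit re-runs the whole script on every interaction, unguarded assignments would wipe chat_history and pending_response on each rerun; the "not in st.session_state" guards above make each value initialize exactly once per session. The same pattern can be factored into a tiny helper, shown here as an illustrative sketch only (init_state is not part of app.py):

import streamlit as st

def init_state(key, default):
    # Only set the value the first time the script runs in this session.
    if key not in st.session_state:
        st.session_state[key] = default

init_state("model_name", None)
init_state("llm", None)
init_state("chat_history", [])
init_state("pending_response", False)
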
@@ -107,13 +115,28 @@ def download_model():
 
 def try_load_model(path):
     try:
-        return Llama(
+        return Llama(
+            model_path=path,
+            n_ctx=1024,
+            n_threads=2,
+            n_threads_batch=2,
+            n_batch=4,
+            n_gpu_layers=0,
+            use_mlock=False,
+            use_mmap=True,
+            verbose=False,
+        )
     except Exception as e:
         return str(e)
 
 def validate_or_download_model():
+    # Download model if it doesn't exist locally.
     if not os.path.exists(model_path):
-
+        # Check free space and cleanup old models only if free space is insufficient.
+        free_space = shutil.disk_usage(".").free
+        if free_space < REQUIRED_SPACE_BYTES:
+            st.info("Insufficient storage detected. Cleaning up old models to free up space.")
+            cleanup_old_models()
         download_model()
 
     result = try_load_model(model_path)

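cleanup_old_models() and download_model() are defined elsewhere in app.py and do not appear in this diff. Purely as an assumption about what the cleanup step might involve, a helper in this spirit could delete previously downloaded GGUF files other than the currently selected one (this sketch is not the commit's actual implementation):

import os

def cleanup_old_models(models_dir="models", keep_filename=None):
    # Remove previously downloaded .gguf files to free disk space,
    # optionally keeping the file for the currently selected model.
    for name in os.listdir(models_dir):
        if name.endswith(".gguf") and name != keep_filename:
            try:
                os.remove(os.path.join(models_dir, name))
            except OSError:
                pass  # ignore files that are locked or already gone
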
@@ -121,9 +144,13 @@ def validate_or_download_model():
         st.warning(f"Initial load failed: {result}\nAttempting re-download...")
         try:
             os.remove(model_path)
-        except:
+        except Exception:
             pass
-
+        # Check storage again before re-downloading.
+        free_space = shutil.disk_usage(".").free
+        if free_space < REQUIRED_SPACE_BYTES:
+            st.info("Insufficient storage detected on re-download attempt. Cleaning up old models to free up space.")
+            cleanup_old_models()
         download_model()
         result = try_load_model(model_path)
         if isinstance(result, str):

@@ -142,29 +169,46 @@ if st.session_state.model_name != selected_model_name:
 
 llm = st.session_state.llm
 
-#
-if "chat_history" not in st.session_state:
-    st.session_state.chat_history = []
-
+# Display title and caption
 st.title(f"🧠 {selected_model['description']} (Streamlit + GGUF)")
 st.caption(f"Powered by `llama.cpp` | Model: {selected_model['filename']}")
 
+# Render the full chat history
+for chat in st.session_state.chat_history:
+    with st.chat_message(chat["role"]):
+        st.markdown(chat["content"])
+        # For assistant messages, if there's internal reasoning, display it behind an expander
+        if chat.get("role") == "assistant" and chat.get("thinking"):
+            with st.expander("🧠 Model's Internal Reasoning"):
+                for t in chat["thinking"]:
+                    st.markdown(t.strip())
+
+# Chat input widget
 user_input = st.chat_input("Ask something...")
 
 if user_input:
-    if
-
+    # Block new input if a response is still pending
+    if st.session_state.pending_response:
+        st.warning("Please wait for the assistant to finish responding.")
     else:
+        # Append and render the user's message
        st.session_state.chat_history.append({"role": "user", "content": user_input})
-
        with st.chat_message("user"):
            st.markdown(user_input)
 
+        # Mark that we are waiting for a response
+        st.session_state.pending_response = True
+
        MAX_TURNS = 8
-
+        # Use the latest MAX_TURNS * 2 messages (system prompt plus conversation)
+        trimmed_history = st.session_state.chat_history[-(MAX_TURNS * 2):]
        messages = [{"role": "system", "content": system_prompt}] + trimmed_history
 
+        # Create a container for the assistant's streaming message with two placeholders:
+        # one for visible output and one for the think part.
        with st.chat_message("assistant"):
+            visible_placeholder = st.empty()
+            thinking_placeholder = st.empty()
            full_response = ""
            stream = llm.create_chat_completion(
                messages=messages,

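The slice st.session_state.chat_history[-(MAX_TURNS * 2):] keeps at most the last MAX_TURNS exchanges, since each turn contributes one user and one assistant message, so the prompt sent to the model stays bounded. A quick worked example with made-up messages:

MAX_TURNS = 8

# 20 alternating user/assistant messages, i.e. 10 full turns.
chat_history = [
    {"role": "user" if i % 2 == 0 else "assistant", "content": f"msg {i}"}
    for i in range(20)
]

trimmed_history = chat_history[-(MAX_TURNS * 2):]
print(len(trimmed_history))           # 16 -> the 8 most recent turns
print(trimmed_history[0]["content"])  # "msg 4" -> the two oldest turns were dropped
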
@@ -175,19 +219,34 @@ if user_input:
                repeat_penalty=repeat_penalty,
                stream=True,
            )
-
+            # Stream and update the assistant's message in real time
            for chunk in stream:
                if "choices" in chunk:
                    delta = chunk["choices"][0]["delta"].get("content", "")
                    full_response += delta
-
+                    # Update visible response by filtering out think parts
+                    visible_response = re.sub(r"<think>.*?</think>", "", full_response, flags=re.DOTALL)
+                    visible_placeholder.markdown(visible_response)
+                    # Extract and pretty format internal reasoning (if any) while streaming
+                    thinking = re.findall(r"<think>(.*?)</think>", full_response, flags=re.DOTALL)
+                    if thinking:
+                        thinking_display = "\n\n".join(f"- {t.strip()}" for t in thinking)
+                        thinking_placeholder.markdown(f"**Internal Reasoning (in progress):**\n\n{thinking_display}")
+                    else:
+                        thinking_placeholder.empty()
+            # After streaming completes, process the final full response:
            visible_response = re.sub(r"<think>.*?</think>", "", full_response, flags=re.DOTALL)
-            st.markdown(visible_response)
-
-            st.session_state.chat_history.append({"role": "assistant", "content": full_response})
-
            thinking = re.findall(r"<think>(.*?)</think>", full_response, flags=re.DOTALL)
+            st.session_state.chat_history.append({
+                "role": "assistant",
+                "content": visible_response,
+                "thinking": thinking
+            })
+            # Display the final internal reasoning behind an expander if available
            if thinking:
                with st.expander("🧠 Model's Internal Reasoning"):
                    for t in thinking:
                        st.markdown(t.strip())
+
+        # Clear the pending flag once done
+        st.session_state.pending_response = False

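The two regular expressions above split the model output into what the user sees and what goes behind the reasoning expander: re.sub strips every <think>...</think> block from the visible text, while re.findall collects the blocks' contents. A standalone demonstration on a made-up response string:

import re

full_response = (
    "<think>The user wants a short answer.</think>"
    "Paris is the capital of France."
    "<think>No follow-up needed.</think>"
)

# Hide the reasoning from the visible reply.
visible_response = re.sub(r"<think>.*?</think>", "", full_response, flags=re.DOTALL)
# Collect the reasoning blocks for the expander.
thinking = re.findall(r"<think>(.*?)</think>", full_response, flags=re.DOTALL)

print(visible_response)  # Paris is the capital of France.
print(thinking)          # ['The user wants a short answer.', 'No follow-up needed.']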