ZeroGPU-LLM-Inference

Running

App Files Files Community

Luigi committed on May 2

Commit

8c3c2b9

1 Parent(s): c09049b

Support thinking models and stream their thoughts to the display as they are generated

Browse files

Files changed (1) hide show

app.py +59 -10

app.py CHANGED Viewed

@@ -4,6 +4,7 @@ import gc
 import threading
 from itertools import islice
 from datetime import datetime
 import gradio as gr
 import torch
 from transformers import pipeline, TextIteratorStreamer
@@ -98,7 +99,7 @@ def retrieve_context(query, max_results=6, max_chars=600):
 def format_conversation(history, system_prompt, tokenizer):
     if hasattr(tokenizer, "chat_template") and tokenizer.chat_template:
         messages = [{"role": "system", "content": system_prompt.strip()}] + history
-        return tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
     else:
         # Fallback for base LMs without chat template
         prompt = system_prompt.strip() + "\n"
@@ -178,25 +179,73 @@ def chat_response(user_msg, chat_history, system_prompt,
                 'top_p': top_p,
                 'repetition_penalty': repeat_penalty,
                 'streamer': streamer,
-                'return_full_text': False
             }
         )
         gen_thread.start()
-        assistant_text = ''
-        # Prepare assistant placeholder
-        history.append({'role': 'assistant', 'content': ''})
         for chunk in streamer:
             if cancel_event.is_set():
                 break
-            assistant_text += chunk
-            history[-1]['content'] = assistant_text
-            # Show debug only once
-            yield history, debug # ← Show search results during streaming
         gen_thread.join()
         yield history, debug + prompt_debug
     except Exception as e:
-        history[-1]['content'] = f"Error: {e}"
         yield history, debug
     finally:
         gc.collect()

 import threading
 from itertools import islice
 from datetime import datetime
+import re  # for parsing <think> blocks
 import gradio as gr
 import torch
 from transformers import pipeline, TextIteratorStreamer
 def format_conversation(history, system_prompt, tokenizer):
     if hasattr(tokenizer, "chat_template") and tokenizer.chat_template:
         messages = [{"role": "system", "content": system_prompt.strip()}] + history
+        return tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True, enable_thinking=True)
     else:
         # Fallback for base LMs without chat template
         prompt = system_prompt.strip() + "\n"
                 'top_p': top_p,
                 'repetition_penalty': repeat_penalty,
                 'streamer': streamer,
+                'return_full_text': False,
             }
         )
         gen_thread.start()
+        # Buffers for thought vs answer
+        thought_buf = ''
+        answer_buf = ''
+        in_thought = False
+        # Stream tokens
         for chunk in streamer:
             if cancel_event.is_set():
                 break
+            text = chunk
+            # Detect start of thinking
+            if not in_thought and '<think>' in text:
+                in_thought = True
+                # Insert thought placeholder
+                history.append({
+                    'role': 'assistant',
+                    'content': '',
+                    'metadata': {'title': '💭 Thought'}
+                })
+                # Capture after opening tag
+                after = text.split('<think>', 1)[1]
+                thought_buf += after
+                # If closing tag in same chunk
+                if '</think>' in thought_buf:
+                    before, after2 = thought_buf.split('</think>', 1)
+                    history[-1]['content'] = before.strip()
+                    in_thought = False
+                    # Start answer buffer
+                    answer_buf = after2
+                    history.append({'role': 'assistant', 'content': answer_buf})
+                else:
+                    history[-1]['content'] = thought_buf
+                yield history, debug
+                continue
+            # Continue thought streaming
+            if in_thought:
+                thought_buf += text
+                if '</think>' in thought_buf:
+                    before, after2 = thought_buf.split('</think>', 1)
+                    history[-1]['content'] = before.strip()
+                    in_thought = False
+                    # Start answer buffer
+                    answer_buf = after2
+                    history.append({'role': 'assistant', 'content': answer_buf})
+                else:
+                    history[-1]['content'] = thought_buf
+                yield history, debug
+                continue
+            # Stream answer
+            if not answer_buf:
+                history.append({'role': 'assistant', 'content': ''})
+            answer_buf += text
+            history[-1]['content'] = answer_buf
+            yield history, debug
         gen_thread.join()
         yield history, debug + prompt_debug
     except Exception as e:
+        history.append({'role': 'assistant', 'content': f"Error: {e}"})
         yield history, debug
     finally:
         gc.collect()