import streamlit as st
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
import datetime
# Page configuration
st.set_page_config(
    page_title="Qwen2.5-Coder Chat",
    page_icon="💬",
    layout="wide"
)

# Initialize session state for conversation history
if 'messages' not in st.session_state:
    st.session_state.messages = []
# Cache the model loading so it runs once per session, not on every rerun
@st.cache_resource
def load_model_and_tokenizer():
    model_name = "Qwen/Qwen2.5-Coder-32B-Instruct"
    # Configure 4-bit NF4 quantization (the bnb_4bit_* options only take
    # effect when load_in_4bit=True)
    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.float16,
        bnb_4bit_use_double_quant=False,
    )
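    # Rough assumption: a 32B-parameter model in 4-bit NF4 occupies on the
    # order of 18-20 GB of GPU memory for weights alone, plus headroom for
    # the KV cache; adjust the quantization settings to fit your hardware.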
    # Load tokenizer and model
    tokenizer = AutoTokenizer.from_pretrained(
        model_name,
        trust_remote_code=True
    )
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        quantization_config=bnb_config,
        torch_dtype=torch.float16,
        device_map="auto",
        trust_remote_code=True
    )
    return tokenizer, model
# Main title
st.title("💬 Qwen2.5-Coder Chat")
# Sidebar settings
with st.sidebar:
    st.header("Settings")

    max_length = st.slider(
        "Maximum Length",
        min_value=64,
        max_value=4096,
        value=512,
        step=64,
        help="Maximum number of tokens to generate"
    )

    temperature = st.slider(
        "Temperature",
        min_value=0.1,
        max_value=2.0,
        value=0.7,
        step=0.1,
        help="Higher values make output more random, lower values more deterministic"
    )

    top_p = st.slider(
        "Top P",
        min_value=0.1,
        max_value=1.0,
        value=0.9,
        step=0.1,
        help="Nucleus sampling: higher values consider more tokens, lower values are more focused"
    )

    if st.button("Clear Conversation"):
        st.session_state.messages = []
        st.rerun()
# Load model with error handling
try:
    with st.spinner("Loading model... Please wait..."):
        tokenizer, model = load_model_and_tokenizer()
except Exception as e:
    st.error(f"Error loading model: {str(e)}")
    st.stop()
def generate_response(prompt, max_new_tokens=512, temperature=0.7, top_p=0.9):
    """Generate a response from the model"""
    try:
        # Tokenize input
        inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

        # Generate response
        with torch.no_grad():
            outputs = model.generate(
                **inputs,
                max_new_tokens=max_new_tokens,
                temperature=temperature,
                top_p=top_p,
                do_sample=True,
                pad_token_id=tokenizer.pad_token_id or tokenizer.eos_token_id,
                eos_token_id=tokenizer.eos_token_id,
            )

        # Decode only the newly generated tokens (everything after the prompt);
        # this is more reliable than slicing the decoded string by len(prompt)
        new_tokens = outputs[0][inputs["input_ids"].shape[1]:]
        response = tokenizer.decode(new_tokens, skip_special_tokens=True).strip()
        return response

    except Exception as e:
        st.error(f"Error generating response: {str(e)}")
        return None
# Display chat history
for message in st.session_state.messages:
    with st.chat_message(message["role"]):
        st.write(f"{message['content']}\n\n_{message['timestamp']}_")
# Chat input
if prompt := st.chat_input("Ask me anything about coding..."):
    # Add user message to chat
    timestamp = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    st.session_state.messages.append({
        "role": "user",
        "content": prompt,
        "timestamp": timestamp
    })

    # Display user message
    with st.chat_message("user"):
        st.write(f"{prompt}\n\n_{timestamp}_")
    # Generate and display response
    with st.chat_message("assistant"):
        with st.spinner("Thinking..."):
            # Prepare conversation history as a plain Human/Assistant transcript
            conversation = ""
            for msg in st.session_state.messages:
                if msg["role"] == "user":
                    conversation += f"Human: {msg['content']}\n"
                else:
                    conversation += f"Assistant: {msg['content']}\n"
            conversation += "Assistant:"
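            # Note: the plain "Human:/Assistant:" transcript above is a
            # simplification. Qwen2.5 instruct models are trained on a chat
            # template, so a sketch of the more faithful approach (assuming
            # the standard Hugging Face tokenizer API) would be:
            #
            #   chat = [{"role": m["role"], "content": m["content"]}
            #           for m in st.session_state.messages]
            #   conversation = tokenizer.apply_chat_template(
            #       chat, tokenize=False, add_generation_prompt=True
            #   )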
            response = generate_response(
                conversation,
                max_new_tokens=max_length,
                temperature=temperature,
                top_p=top_p
            )

            if response:
                timestamp = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
                st.write(f"{response}\n\n_{timestamp}_")

                # Add assistant response to chat history
                st.session_state.messages.append({
                    "role": "assistant",
                    "content": response,
                    "timestamp": timestamp
                })