# Petite Elle L'Aime 3 - Gradio chat Space (Hugging Face Spaces, running on ZeroGPU)

import gradio as gr
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
import re
import json
from typing import List, Dict, Any, Optional
import logging
import spaces
import os

# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Model configuration
MAIN_MODEL_ID = "Tonic/petite-elle-L-aime-3-sft"  # Main repo for config and chat template
INT4_MODEL_ID = "Tonic/petite-elle-L-aime-3-sft/int4"  # Int4 quantized model
LOCAL_MODEL_PATH = "./int4"  # Local int4 weights
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

# Global variables for model and tokenizer
model = None
tokenizer = None

# Default system prompt ("You are TonicIA, a rigorous and caring French-speaking assistant.")
DEFAULT_SYSTEM_PROMPT = "Tu es TonicIA, un assistant francophone rigoureux et bienveillant."

# Title and description content
title = "# 🤖 Petite Elle L'Aime 3 - Chat Interface"
description = "A fine-tuned version of SmolLM3-3B optimized for French and multilingual conversations. This is the int4 quantized version for efficient CPU deployment."

presentation1 = """
### 🎯 Features
- **Multilingual Support**: English, French, Italian, Portuguese, Chinese, Arabic
- **Int4 Quantization**: Optimized for CPU deployment with ~50% memory reduction
- **Interactive Chat Interface**: Real-time conversation with the model
- **Customizable System Prompt**: Define the assistant's personality and behavior
- **Thinking Mode**: Enable reasoning mode with thinking tags
"""

presentation2 = """
### 📊 Model Information
- **Base Model**: SmolLM3-3B
- **Parameters**: ~3B
- **Context Length**: 128k
- **Languages**: English, French, Italian, Portuguese, Chinese, Arabic
- **Device**: CPU optimized
- **Quantization**: int4
"""

joinus = """
### 🚀 Quick Start
1. Add context in the system prompt
2. Type your message
3. Click generate to start chatting
4. Use advanced settings for fine-tuning
"""

def check_local_model():
    """Check if local int4 model files exist"""
    required_files = [
        "config.json",
        "pytorch_model.bin",
        "tokenizer.json",
        "tokenizer_config.json"
    ]
    for file in required_files:
        file_path = os.path.join(LOCAL_MODEL_PATH, file)
        if not os.path.exists(file_path):
            logger.warning(f"Missing required file: {file_path}")
            return False
    logger.info("All required model files found locally")
    return True
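
# Expected local layout after the Space's build step (illustrative sketch; adjust the
# required_files list above if the checkpoint ships safetensors instead of
# pytorch_model.bin):
#   ./int4/config.json
#   ./int4/pytorch_model.bin
#   ./int4/tokenizer.json
#   ./int4/tokenizer_config.json
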
def load_model():
    """Load the model and tokenizer"""
    global model, tokenizer
    try:
        # Check if local model exists (downloaded during build)
        if check_local_model():
            logger.info(f"Loading tokenizer from {LOCAL_MODEL_PATH}")
            tokenizer = AutoTokenizer.from_pretrained(LOCAL_MODEL_PATH)
            logger.info(f"Loading int4 model from {LOCAL_MODEL_PATH}")
            model = AutoModelForCausalLM.from_pretrained(
                LOCAL_MODEL_PATH,
                device_map="auto" if DEVICE == "cuda" else "cpu",
                torch_dtype=torch.bfloat16,
                trust_remote_code=True
            )
        else:
            logger.info(f"Local model not found, loading from {MAIN_MODEL_ID}")
            # Load tokenizer from main repo (for chat template and config)
            tokenizer = AutoTokenizer.from_pretrained(MAIN_MODEL_ID)
            logger.info(f"Loading int4 model from {INT4_MODEL_ID}")
            # Hub repo ids cannot contain a third path segment, so the int4 weights
            # are assumed to live in the "int4" subfolder of the main repo and are
            # loaded via the subfolder argument.
            model = AutoModelForCausalLM.from_pretrained(
                MAIN_MODEL_ID,
                subfolder="int4",
                device_map="auto" if DEVICE == "cuda" else "cpu",
                torch_dtype=torch.bfloat16,
                trust_remote_code=True
            )
        # Set pad token if not present
        if tokenizer.pad_token_id is None:
            tokenizer.pad_token_id = tokenizer.eos_token_id
        logger.info("Model loaded successfully")
        return True
    except Exception as e:
        logger.error(f"Error loading model: {e}")
        return False

def create_prompt(system_message, user_message, enable_thinking=True):
    """Create prompt using the model's chat template"""
    try:
        # Prepare messages for the template
        formatted_messages = []
        # Add system message if provided
        if system_message and system_message.strip():
            formatted_messages.append({"role": "system", "content": system_message})
        # Add user message
        formatted_messages.append({"role": "user", "content": user_message})
        # Apply the chat template
        prompt = tokenizer.apply_chat_template(
            formatted_messages,
            tokenize=False,
            add_generation_prompt=True,
            enable_thinking=enable_thinking
        )
        # Add /no_think to the end of the prompt when thinking is disabled
        if not enable_thinking:
            prompt += " /no_think"
        return prompt
    except Exception as e:
        logger.error(f"Error creating prompt: {e}")
        return ""
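
# For reference, SmolLM3's chat template is ChatML-style, so the rendered prompt is
# expected to look roughly like the sketch below (illustrative, not the exact template
# output; the same <|im_start|>/<|im_end|> markers are stripped from the model output
# further down):
#   <|im_start|>system
#   Tu es TonicIA, un assistant francophone rigoureux et bienveillant.<|im_end|>
#   <|im_start|>user
#   Bonjour !<|im_end|>
#   <|im_start|>assistant
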
@spaces.GPU  # ZeroGPU: attach a GPU for the duration of each generation call
def generate_response(message, history, system_message, max_tokens, temperature, top_p, do_sample, enable_thinking=True):
    """Generate response using the model"""
    global model, tokenizer
    if model is None or tokenizer is None:
        return "Error: Model not loaded. Please wait for the model to load."
    try:
        # Create prompt using chat template
        full_prompt = create_prompt(system_message, message, enable_thinking)
        if not full_prompt:
            return "Error: Failed to create prompt."
        # Tokenize the input
        inputs = tokenizer(full_prompt, return_tensors="pt", padding=True, truncation=True)
        # Move to device
        if DEVICE == "cuda":
            inputs = {k: v.cuda() for k, v in inputs.items()}
        # Generate response
        with torch.no_grad():
            output_ids = model.generate(
                inputs['input_ids'],
                max_new_tokens=max_tokens,
                temperature=temperature,
                top_p=top_p,
                do_sample=do_sample,
                attention_mask=inputs['attention_mask'],
                pad_token_id=tokenizer.eos_token_id,
                eos_token_id=tokenizer.eos_token_id
            )
        # Decode only the newly generated tokens rather than slicing the decoded
        # string by len(full_prompt): skip_special_tokens drops the chat-template
        # markers, which shifts the prompt offsets in the decoded text.
        new_tokens = output_ids[0][inputs['input_ids'].shape[1]:]
        assistant_response = tokenizer.decode(new_tokens, skip_special_tokens=True)
        # Clean up the response - only remove special tokens, preserve thinking tags when enabled
        assistant_response = re.sub(r'<\|im_start\|>.*?<\|im_end\|>', '', assistant_response, flags=re.DOTALL)
        # Only remove thinking tags if thinking mode is disabled
        if not enable_thinking:
            assistant_response = re.sub(r'<think>.*?</think>', '', assistant_response, flags=re.DOTALL)
        assistant_response = assistant_response.strip()
        return assistant_response
    except Exception as e:
        logger.error(f"Error generating response: {e}")
        return f"Error generating response: {str(e)}"

def user(user_message, history):
    """Add user message to history"""
    return "", history + [[user_message, None]]

def bot(history, system_prompt, max_length, temperature, top_p, advanced_checkbox, enable_thinking):
    """Generate bot response"""
    user_message = history[-1][0]
    do_sample = advanced_checkbox
    bot_message = generate_response(user_message, history, system_prompt, max_length, temperature, top_p, do_sample, enable_thinking)
    history[-1][1] = bot_message
    return history
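
# Note: the "Advanced Settings" checkbox doubles as the do_sample flag above. When it
# is unchecked, decoding is greedy and the temperature / top_p sliders are effectively
# ignored by transformers' generate().
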
# Load model on startup
logger.info("Starting model loading process...")
load_model()
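# load_model() returns False on failure; the UI below still launches in that case and
# generate_response() answers with an error message instead of crashing.
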
# Create Gradio interface
with gr.Blocks() as demo:
    with gr.Row():
        gr.Markdown(title)
    with gr.Row():
        gr.Markdown(description)
    with gr.Row():
        with gr.Column(scale=1):
            with gr.Group():
                gr.Markdown(presentation1)
        with gr.Column(scale=1):
            with gr.Group():
                gr.Markdown(presentation2)
    with gr.Row():
        with gr.Column(scale=1):
            with gr.Group():
                gr.Markdown(joinus)
        with gr.Column(scale=1):
            pass  # Empty column for balance
    with gr.Row():
        with gr.Column(scale=2):
            system_prompt = gr.TextArea(
                label="📝 Context",
                placeholder="Tu es TonicIA, un assistant francophone rigoureux et bienveillant.",
                lines=5,
                value=DEFAULT_SYSTEM_PROMPT
            )
            user_input = gr.TextArea(
                label="🤷🏻‍♂️ User Input",
                placeholder="Hi there my name is Tonic!",
                lines=2
            )
            advanced_checkbox = gr.Checkbox(label="🧪 Advanced Settings", value=False)
            with gr.Column(visible=False) as advanced_settings:
                max_length = gr.Slider(
                    label="📏 Max Length",
                    minimum=64,
                    maximum=2048,
                    value=512,
                    step=64
                )
                temperature = gr.Slider(
                    label="🌡️ Temperature",
                    minimum=0.01,
                    maximum=1.0,
                    value=0.7,
                    step=0.01
                )
                top_p = gr.Slider(
                    label="⚖️ Top-p (Nucleus Sampling)",
                    minimum=0.1,
                    maximum=1.0,
                    value=0.9,
                    step=0.01
                )
                enable_thinking = gr.Checkbox(label="Enable Thinking Mode", value=True)
            generate_button = gr.Button(value="🤖 Petite Elle L'Aime 3")
        with gr.Column(scale=2):
            chatbot = gr.Chatbot(label="🤖 Petite Elle L'Aime 3")
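            # The default gr.Chatbot history format is a list of [user, assistant]
            # pairs, which matches the [[user_message, None]] entries produced by
            # user() and bot() above.
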
    generate_button.click(
        user,
        [user_input, chatbot],
        [user_input, chatbot],
        queue=False
    ).then(
        bot,
        [chatbot, system_prompt, max_length, temperature, top_p, advanced_checkbox, enable_thinking],
        chatbot
    )

    advanced_checkbox.change(
        fn=lambda x: gr.update(visible=x),
        inputs=[advanced_checkbox],
        outputs=[advanced_settings]
    )

if __name__ == "__main__":
    demo.queue()
    demo.launch(ssr_mode=False, mcp_server=True)
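
# Local run (sketch, assuming this file is the Space's app.py and the dependencies from
# the Space's requirements.txt are installed, e.g. gradio, spaces, torch, transformers):
#   python app.py
# On Hugging Face Spaces the same script is launched automatically; mcp_server=True
# additionally exposes the app's endpoints through Gradio's MCP server support.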