File size: 20,002 Bytes
19b19f0
 
d784738
19b19f0
 
 
 
 
 
5b9c598
f3832f5
2ee7774
19b19f0
109031b
 
 
 
 
19b19f0
 
a3113ce
19b19f0
 
 
 
5975026
 
19b19f0
 
 
109031b
19b19f0
 
 
d784738
19b19f0
1a6008e
 
109031b
1a6008e
2ee7774
1a6008e
d784738
19b19f0
 
1a6008e
 
19b19f0
 
d784738
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1a6008e
9d0240c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
19b19f0
109031b
19b19f0
9d0240c
 
 
3436b88
cfb5435
 
 
 
 
55d7c97
cfb5435
2ee7774
109031b
55d7c97
 
fbcc018
55d7c97
1a6008e
3448aad
55d7c97
 
 
cfb5435
 
9d0240c
 
 
 
109031b
19b19f0
9d0240c
 
 
55d7c97
9d0240c
384c439
19b19f0
d784738
 
19b19f0
 
 
d784738
 
 
 
 
 
 
 
 
19b19f0
d784738
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
19b19f0
 
 
 
 
 
 
109031b
d784738
109031b
19b19f0
 
 
 
2ee7774
d784738
 
 
 
 
 
 
 
 
 
1a6008e
 
 
19b19f0
1a6008e
 
 
 
 
2ee7774
1a6008e
 
 
 
 
 
d784738
1a6008e
 
 
2ee7774
3679e0b
2ee7774
9413e52
 
7578eea
9413e52
0004ba8
 
 
7578eea
9413e52
0004ba8
9413e52
0004ba8
9413e52
 
0004ba8
9413e52
 
 
0004ba8
9413e52
 
0004ba8
9413e52
 
0004ba8
9413e52
 
 
 
0004ba8
9413e52
 
0004ba8
9413e52
 
 
 
 
 
 
7578eea
9413e52
0004ba8
 
 
7578eea
 
0004ba8
 
 
7578eea
19b19f0
7578eea
 
0004ba8
 
 
7578eea
19b19f0
 
7578eea
 
0004ba8
 
 
7578eea
 
0004ba8
 
 
19b19f0
d784738
 
 
 
 
 
 
 
 
 
 
 
 
7578eea
0004ba8
 
 
7578eea
19b19f0
 
7578eea
0004ba8
 
 
 
7578eea
19b19f0
 
 
 
384c439
 
 
19b19f0
d784738
19b19f0
384c439
1a6008e
384c439
 
19b19f0
d784738
 
 
 
 
 
 
384c439
19b19f0
 
 
109031b
19b19f0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1a6008e
19b19f0
 
 
 
 
1a6008e
 
19b19f0
 
 
 
 
1a6008e
 
7578eea
 
1a6008e
19b19f0
 
1a6008e
19b19f0
 
d784738
19b19f0
 
 
1a6008e
19b19f0
 
d784738
 
 
 
109031b
d784738
 
 
19b19f0
 
1a6008e
d784738
 
 
 
 
 
 
 
 
19b19f0
 
 
 
384c439
19b19f0
 
 
 
 
 
 
 
d784738
19b19f0
 
 
 
 
 
 
 
d784738
 
 
 
 
 
19b19f0
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
import gradio as gr
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
import re
import json
from typing import List, Dict, Any, Optional
import logging
import spaces
import os
import sys
import requests
import accelerate

# Process-wide default floating-point dtype: half precision when a CUDA
# device is visible, single precision otherwise.
torch.set_default_dtype(
    torch.float16 if torch.cuda.is_available() else torch.float32
)

# Root logger at INFO; this module logs through `logger`.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Hub repository of the model and the device label used when loading it.
MAIN_MODEL_ID = "Tonic/petite-elle-L-aime-3-sft"
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

# Assigned by load_model().
model = tokenizer = None
# Initial value of the system-prompt text area in the UI (French text).
DEFAULT_SYSTEM_PROMPT = "Tu es TonicIA, un assistant francophone rigoureux et bienveillant."
# Markdown heading and one-line description rendered at the top of the page.
title = "# 🙋🏻‍♂️Welcome to 🌟Tonic's Petite Elle L'Aime 3"
description = "A fine-tuned version of SmolLM3-3B optimized for French conversations."
# Feature list shown in the left column (English).
presentation1 = """
### 🎯 Features
- **Multilingual Support**: English, French, Italian, Portuguese, Chinese, Arabic
- **Full Fine-Tuned Model**: Maximum performance and quality with full precision
- **Interactive Chat Interface**: Real-time conversation with the model
- **Customizable System Prompt**: Define the assistant's personality and behavior
- **Thinking Mode**: Enable reasoning mode with thinking tags
- **Tool Calling**: Support for function calling with XML and Python tools
"""
# Feature list shown in the right column (French version of the same bullets).
presentation2 = """### 🎯 Fonctionnalités
* **Support multilingue** : Anglais, Français, Italien, Portugais, Chinois, Arabe
* **Modèle complet fine-tuné** : Performance et qualité maximales avec précision complète
* **Interface de chat interactive** : Conversation en temps réel avec le modèle
* **Invite système personnalisable** : Définissez la personnalité et le comportement de l'assistant
* **Mode Réflexion** : Activez le mode raisonnement avec des balises de réflexion
* **Appel d'outils** : Support pour l'appel de fonctions avec XML et Python
"""
# Community / credits banner (Markdown with badge and links).
joinus = """
## Join us :
🌟TeamTonic🌟 is always making cool demos! Join our active builder's 🛠️community 👻 [![Join us on Discord](https://img.shields.io/discord/1109943800132010065?label=Discord&logo=discord&style=flat-square)](https://discord.gg/qdfnvSPcqP) On 🤗Huggingface:[MultiTransformer](https://huggingface.co/MultiTransformer) On 🌐Github: [Tonic-AI](https://github.com/tonic-ai) & contribute to🌟 [Build Tonic](https://git.tonic-ai.com/contribute)🤗Big thanks to Yuvi Sharma and all the folks at huggingface for the community grant 🤗
"""

# Example tool schemas; serialized to JSON as the initial content of the
# "Tool Definition (JSON)" editor. Key order is kept stable because it is
# visible in that serialized text.
DEFAULT_TOOLS = [
    {
        "name": "get_weather",
        "description": "Get the weather in a city",
        "parameters": {
            "type": "object",
            "properties": {
                "city": {"type": "string", "description": "The city to get the weather for"},
            },
        },
    },
    {
        "name": "calculate",
        "description": "Perform mathematical calculations",
        "parameters": {
            "type": "object",
            "properties": {
                "expression": {"type": "string", "description": "Mathematical expression to evaluate"},
            },
        },
    },
]


def download_chat_template():
    """Fetch ``chat_template.jinja`` from the model repository.

    Returns:
        The template text on success, or None when the HTTP request fails
        or any other exception is raised (the error is logged).
    """
    template_text = None
    try:
        url = f"https://huggingface.co/{MAIN_MODEL_ID}/raw/main/chat_template.jinja"
        logger.info(f"Downloading chat template from {url}")

        reply = requests.get(url, timeout=30)
        # Turn 4xx/5xx answers into an exception handled below.
        reply.raise_for_status()

        body = reply.text
        logger.info("Chat template downloaded successfully")
        template_text = body
    except requests.exceptions.RequestException as err:
        logger.error(f"Error downloading chat template: {err}")
    except Exception as err:
        logger.error(f"Unexpected error downloading chat template: {err}")
    return template_text


def load_model():
    """Load the tokenizer and the full fine-tuned model into module globals.

    The tokenizer is used with whatever chat template it ships with;
    download_chat_template() is not called here.

    Both objects are loaded into locals first and only published to the
    ``model`` / ``tokenizer`` globals once everything succeeded, so a failed
    load never leaves a tokenizer without a matching model.

    Returns:
        True when loading succeeded, False when any step raised (the
        exception is logged with its traceback and the globals are left
        unchanged).
    """
    global model, tokenizer

    try:
        logger.info(f"Loading tokenizer from {MAIN_MODEL_ID}")
        loaded_tokenizer = AutoTokenizer.from_pretrained(MAIN_MODEL_ID)

        on_gpu = DEVICE == "cuda"
        model_kwargs = {
            "device_map": "auto" if on_gpu else "cpu",
            # bfloat16 on GPU, float32 on CPU.
            "torch_dtype": torch.bfloat16 if on_gpu else torch.float32,
            # NOTE(review): this allows code from the model repository to
            # run locally; keep only if the repository is trusted.
            "trust_remote_code": True,
            "low_cpu_mem_usage": True,
        }

        logger.info(f"Model loading parameters: {model_kwargs}")
        loaded_model = AutoModelForCausalLM.from_pretrained(MAIN_MODEL_ID, **model_kwargs)

        # generate() needs a pad token id; reuse EOS when none is defined.
        if loaded_tokenizer.pad_token_id is None:
            loaded_tokenizer.pad_token_id = loaded_tokenizer.eos_token_id
    except Exception as e:
        # logger.exception keeps the traceback, which a plain error() drops.
        logger.exception(f"Error loading model: {e}")
        return False

    model, tokenizer = loaded_model, loaded_tokenizer
    logger.info("Full fine-tuned model loaded successfully")
    return True


def create_prompt(system_message, user_message, enable_thinking=True, tools=None, use_xml_tools=True):
    """Render a single-turn prompt through the tokenizer's chat template.

    Args:
        system_message: Optional system prompt. When non-blank, a
            ``/think`` or ``/no_think`` flag is appended unless the text
            already carries one (an existing ``/no_think`` always wins).
        user_message: The user's message.
        enable_thinking: Forwarded to the template and used to pick the flag.
        tools: Optional tool definitions forwarded to the template.
        use_xml_tools: Forward ``tools`` as ``xml_tools`` when True,
            otherwise as ``python_tools``.

    Returns:
        The rendered prompt string, or "" when anything raised (the error
        is logged).
    """
    try:
        conversation = []

        if system_message and system_message.strip():
            already_off = "/no_think" in system_message
            already_on = "/think" in system_message

            # An explicit "/no_think" in the text is never overridden.
            if not already_off:
                if not enable_thinking:
                    system_message += "/no_think"
                elif not already_on:
                    system_message += "/think"

            conversation.append({"role": "system", "content": system_message})

        conversation.append({"role": "user", "content": user_message})

        render_options = dict(
            tokenize=False,
            add_generation_prompt=True,
            enable_thinking=enable_thinking,
        )

        # Tools go under exactly one of the two template variables.
        if tools and len(tools):
            tools_key = "xml_tools" if use_xml_tools else "python_tools"
            render_options[tools_key] = tools

        return tokenizer.apply_chat_template(conversation, **render_options)

    except Exception as err:
        logger.error(f"Error creating prompt: {err}")
        return ""

def _parse_tool_definitions(tools):
    """Return tool definitions as Python objects, or None when none were given.

    JSON text (as produced by the UI editor) is parsed; blank text counts as
    "no tools". Already-parsed, non-string values are passed through as-is.
    Raises json.JSONDecodeError for malformed JSON text.
    """
    if not tools:
        return None
    if isinstance(tools, str):
        return json.loads(tools) if tools.strip() else None
    return tools


def _strip_chat_markup(text, enable_thinking):
    """Remove leftover chat markers and, when thinking is off, <think> blocks."""
    # Drop any complete extra turn first, then stray start/end markers.
    text = re.sub(r'<\|im_start\|>.*?<\|im_end\|>', '', text, flags=re.DOTALL)
    text = re.sub(r'<\|im_start\|>', '', text)
    text = re.sub(r'<\|im_end\|>', '', text)
    if not enable_thinking:
        text = re.sub(r'<think>.*?</think>', '', text, flags=re.DOTALL)
    return text


def _annotate_tool_calls(text):
    """Append a human-readable note for the first tool call found in ``text``."""
    if "<tool_call>" in text:
        tool_call_match = re.search(r'<tool_call>(.*?)</tool_call>', text, re.DOTALL)
        if tool_call_match:
            tool_call = tool_call_match.group(1)
            text += f"\n\n🔧 Tool Call Detected: {tool_call}\n\nNote: This is a simulated tool call. In a real scenario, the tool would be executed and its output would be used to generate a final response."
    elif "<code>" in text:
        code_match = re.search(r'<code>(.*?)</code>', text, re.DOTALL)
        if code_match:
            code_call = code_match.group(1)
            text += f"\n\n🐍 Python Tool Call: {code_call}\n\nNote: This is a simulated Python tool call. In a real scenario, the function would be executed and its output would be used to generate a final response."
    return text


@spaces.GPU()
def generate_response(message, history, system_message, max_tokens, temperature, top_p, repetition_penalty, do_sample, enable_thinking=True, tools=None, use_xml_tools=True):
    """Generate the assistant's reply to ``message``.

    Args:
        message: The user's message for this turn.
        history: Accepted for interface compatibility; not used here, only
            ``message`` and ``system_message`` are sent to the model.
        system_message: System prompt passed to create_prompt().
        max_tokens: Upper bound on newly generated tokens.
        temperature, top_p, repetition_penalty, do_sample: Forwarded to
            ``model.generate``.
        enable_thinking: When False, ``<think>...</think>`` blocks are
            removed from the reply.
        tools: Tool definitions as JSON text (or an already-parsed object),
            or None / blank for no tools.
        use_xml_tools: Passed to create_prompt() to select the tool format.

    Returns:
        The cleaned assistant text, or a string starting with "Error:" when
        the model is not loaded, the tools JSON is invalid, or the prompt
        could not be built.
    """
    if model is None or tokenizer is None:
        return "Error: Model not loaded. Please wait for the model to load."

    try:
        parsed_tools = _parse_tool_definitions(tools)
    except json.JSONDecodeError as e:
        logger.error(f"Error parsing tools JSON: {e}")
        return "Error: Invalid tool definition JSON format."

    full_prompt = create_prompt(system_message, message, enable_thinking, parsed_tools, use_xml_tools)
    if not full_prompt:
        return "Error: Failed to create prompt."

    inputs = tokenizer(full_prompt, return_tensors="pt", padding=True, truncation=True)
    logger.info(f"Input tensor shapes: {[(k, v.shape, v.dtype) for k, v in inputs.items()]}")

    # Follow the model's device rather than assuming CUDA.
    inputs = {k: v.to(model.device) for k, v in inputs.items()}
    prompt_token_count = inputs['input_ids'].shape[1]

    # Only generation needs gradients disabled; the text clean-up below is
    # plain string work.
    with torch.no_grad():
        output_ids = model.generate(
            inputs['input_ids'],
            max_new_tokens=max_tokens,
            temperature=temperature,
            top_p=top_p,
            repetition_penalty=repetition_penalty,
            do_sample=do_sample,
            attention_mask=inputs['attention_mask'],
            pad_token_id=tokenizer.eos_token_id,
            eos_token_id=tokenizer.eos_token_id,
        )

    # The returned sequence starts with the prompt tokens; decode only what
    # follows them. This avoids searching the text for an assistant marker
    # (which user-supplied text could contain) and avoids slicing decoded
    # text by the prompt's character length, which does not line up once
    # special tokens are skipped. Special tokens are kept here so the
    # clean-up step sees the markers it is written for.
    generated_ids = output_ids[0][prompt_token_count:]
    assistant_response = tokenizer.decode(generated_ids, skip_special_tokens=False).strip()

    assistant_response = _strip_chat_markup(assistant_response, enable_thinking)

    # Tool-call notes are only added when tools were actually offered.
    if parsed_tools:
        assistant_response = _annotate_tool_calls(assistant_response)

    return assistant_response.strip()

def user(user_message, history):
    """Queue the user's message in the chat history.

    Returns a pair: an empty string (new content of the input box) and a new
    history list ending with the user's message. The list passed in is not
    modified; None is treated as an empty history.
    """
    previous_turns = [] if history is None else history
    new_turn = {"role": "user", "content": user_message}
    return "", previous_turns + [new_turn]

def bot(history, system_prompt, max_length, temperature, top_p, repetition_penalty, advanced_checkbox, enable_thinking, tools, use_xml_tools, use_tools):
    """Answer the last message in ``history`` and append the reply to it.

    An empty or missing history is returned unchanged. Otherwise the content
    of the last entry is sent to generate_response() and the result is
    appended in place as an assistant message.

    ``advanced_checkbox`` is forwarded as the ``do_sample`` argument, and
    ``tools`` is forwarded only when ``use_tools`` is truthy.
    """
    if not history:
        return history

    latest_message = history[-1]["content"]

    reply = generate_response(
        latest_message,
        history,
        system_prompt,
        max_length,
        temperature,
        top_p,
        repetition_penalty,
        advanced_checkbox,  # used as do_sample
        enable_thinking,
        tools if use_tools else None,
        use_xml_tools,
    )

    history.append({"role": "assistant", "content": reply})
    return history

# Load the model at import time, before the UI is built.
# load_model() returns True/False; the result is not checked here.
logger.info("Starting model loading process with full fine-tuned model...")
load_model()

# Create Gradio interface
# Layout: header rows (title, description, feature lists, community banner),
# then one row with the controls on the left and the chat window on the right.
with gr.Blocks() as demo:
    # --- Header: title, description and the two feature lists side by side ---
    with gr.Row():
        gr.Markdown(title)
    with gr.Row():
        gr.Markdown(description)
    with gr.Row():
        with gr.Column(scale=1):
            with gr.Group():
                gr.Markdown(presentation1)
        with gr.Column(scale=1):
            with gr.Group():
                gr.Markdown(presentation2)
    with gr.Row():
        with gr.Column(scale=1):
            with gr.Group():
                gr.Markdown(joinus)
        with gr.Column(scale=1):
            pass  # Empty column for balance

    # --- Main row: inputs and settings (left), conversation (right) ---
    with gr.Row():
        with gr.Column(scale=2):
            system_prompt = gr.TextArea(
                label="📑 Contexte",
                placeholder="Tu es TonicIA, un assistant francophone rigoureux et bienveillant.",
                lines=5,
                value=DEFAULT_SYSTEM_PROMPT
            )
            user_input = gr.TextArea(
                label="🤷🏻‍♂️ Message",
                placeholder="Bonjour je m'appel Tonic!",
                lines=2
            )
            # Toggles the settings column below. NOTE: its value is also passed
            # to bot(), which forwards it as do_sample.
            advanced_checkbox = gr.Checkbox(label="🧪 Advanced Settings", value=False)
            # Hidden until the checkbox above is ticked.
            with gr.Column(visible=False) as advanced_settings:
                max_length = gr.Slider(
                    label="📏 Longueur de la réponse",
                    minimum=10,
                    maximum=9000,   # maximum=32768,
                    value=1256,
                    step=1
                )
                temperature = gr.Slider(
                    label="🌡️ Température",
                    minimum=0.01,
                    maximum=1.0,
                    value=0.6,  # Updated to SmolLM3 recommended
                    step=0.01
                )
                top_p = gr.Slider(
                    label="⚛️ Top-p (Echantillonnage)",
                    minimum=0.1,
                    maximum=1.0,
                    value=0.95,
                    step=0.01
                )
                repetition_penalty = gr.Slider(
                    label="🔄 Pénalité de Répétition",
                    minimum=1.0,
                    maximum=2.0,
                    value=1.1,
                    step=0.01
                )
                enable_thinking = gr.Checkbox(label="Mode Réflexion", value=True)
                use_tools = gr.Checkbox(label="🔧 Enable Tool Calling", value=False)
                use_xml_tools = gr.Checkbox(label="📋 Use XML Tools (vs Python)", value=True)
                # Tool editor, hidden until "Enable Tool Calling" is ticked.
                with gr.Column(visible=False) as tool_options:
                    tools = gr.Code(
                        label="Tool Definition (JSON)",
                        value=json.dumps(DEFAULT_TOOLS, indent=2),
                        lines=15,
                        language="json"
                    )

            generate_button = gr.Button(value="🤖 Petite Elle L'Aime 3")

        with gr.Column(scale=2):
            # type="messages": history is a list of {"role", "content"} dicts,
            # matching what user() and bot() build.
            chatbot = gr.Chatbot(label="🤖 Petite Elle L'Aime 3", type="messages", value=[])

    # Two-step click handler: user() clears the input box and appends the
    # message to the chat (queue=False), then bot() appends the model's reply.
    generate_button.click(
        user,
        [user_input, chatbot],
        [user_input, chatbot],
        queue=False
    ).then(
        bot,
        [chatbot, system_prompt, max_length, temperature, top_p, repetition_penalty, advanced_checkbox, enable_thinking, tools, use_xml_tools, use_tools],
        chatbot
    )

    # Show/hide the advanced settings column with its checkbox.
    advanced_checkbox.change(
        fn=lambda x: gr.update(visible=x),
        inputs=[advanced_checkbox],
        outputs=[advanced_settings]
    )

    # Show/hide the tool definition editor with its checkbox.
    use_tools.change(
        fn=lambda x: gr.update(visible=x),
        inputs=[use_tools],
        outputs=[tool_options]
    )

# Script entry point: enable the request queue, then start the app with
# ssr_mode turned off and mcp_server turned on.
if __name__ == "__main__":
    demo.queue()
    demo.launch(ssr_mode=False, mcp_server=True)