import torch
import gradio as gr
from dataclasses import asdict

from smolagents import (
    CodeAgent,
    InferenceClientModel,
    TransformersModel,
    stream_to_gradio,
)
from transformers import BitsAndBytesConfig

from tools import get_weather, CurrencyConverterTool

model_path = "Qwen/Qwen3-4B-Instruct-2507"
cuda = torch.cuda.is_available()

if cuda:
    print("\nRunning on local GPU\n")
else:
    print("\nNo local GPU found, running via Hugging Face Inference Providers\n")


def interact_with_agent(
    message,
    history: list[dict[str, str]],
    system_message,
    max_tokens,
    temperature,
    top_p,
    hf_token: gr.OAuthToken,
):
    """Build a fresh agent for each request and stream its steps to the chat."""
    if cuda:
        # Load the model locally, quantized to 8-bit so it fits on smaller GPUs.
        quantization = BitsAndBytesConfig(load_in_8bit=True)
        model = TransformersModel(
            model_id=model_path,
            device_map="auto",
            max_new_tokens=max_tokens,
            temperature=temperature,
            top_p=top_p,
            model_kwargs={
                "quantization_config": quantization,
                # Authenticates the download in case the model repo is gated.
                "token": hf_token.token,
            },
        )
    else:
        # No GPU available: run the same model through Hugging Face's hosted
        # inference, forwarding the sampling settings from the UI sliders.
        model = InferenceClientModel(
            model_id=model_path,
            token=hf_token.token,
            max_tokens=max_tokens,
            temperature=temperature,
            top_p=top_p,
        )

    # Note: CodeAgent builds its own system prompt from its templates, so the
    # "System message" textbox is not forwarded to the model here.
    agent = CodeAgent(
        tools=[get_weather, CurrencyConverterTool()],
        model=model,
        max_steps=8,
        verbosity_level=2,
        add_base_tools=True,
    )

    # Stream the agent's intermediate steps into the chat as they arrive.
    messages = []
    yield messages
    for msg in stream_to_gradio(agent, message):
        messages.append(asdict(msg))
        yield messages
    yield messages


chatbot = gr.ChatInterface(
    interact_with_agent,
    type="messages",
    additional_inputs=[
        gr.Textbox(value="You are a friendly Chatbot.", label="System message"),
        gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max new tokens"),
        gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
        gr.Slider(
            minimum=0.1,
            maximum=1.0,
            value=0.95,
            step=0.05,
            label="Top-p (nucleus sampling)",
        ),
    ],
)

# The sidebar login button provides the gr.OAuthToken that Gradio injects into
# interact_with_agent above.
with gr.Blocks() as demo:
    with gr.Sidebar():
        gr.LoginButton()
    chatbot.render()


if __name__ == "__main__":
    demo.launch()
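

# ---------------------------------------------------------------------------
# The `tools` module imported at the top of this script is a separate file.
# If you don't have it, the minimal sketch below is enough to run the demo:
# save it as tools.py next to this script. It assumes smolagents' @tool
# decorator and Tool base class; the weather report and exchange rate are
# hypothetical placeholders, not real data sources.
#
# from smolagents import Tool, tool
#
#
# @tool
# def get_weather(city: str) -> str:
#     """
#     Returns a short weather report for the given city.
#
#     Args:
#         city: Name of the city to look up.
#     """
#     return f"The weather in {city} is sunny at 22 °C."  # placeholder value
#
#
# class CurrencyConverterTool(Tool):
#     name = "currency_converter"
#     description = "Converts an amount from one currency to another."
#     inputs = {
#         "amount": {"type": "number", "description": "Amount to convert."},
#         "from_currency": {"type": "string", "description": "ISO code of the source currency, e.g. USD."},
#         "to_currency": {"type": "string", "description": "ISO code of the target currency, e.g. EUR."},
#     }
#     output_type = "string"
#
#     def forward(self, amount: float, from_currency: str, to_currency: str) -> str:
#         rate = 0.92  # placeholder rate; swap in a real FX API here
#         return f"{amount} {from_currency} is about {amount * rate:.2f} {to_currency}"
# ---------------------------------------------------------------------------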