"""Gradio demo comparing a T5 model against an int8 weight-quantized copy.

Both models are loaded from ``model_dir``. For each prompt the UI shows the
text generated by each model together with how long generation took.
"""

import time

import gradio as gr
from quanto import freeze, qint8, quantize
from transformers import T5ForConditionalGeneration, T5Tokenizer

model_dir = "t5flan"

# Load the baseline (unquantized) model and the tokenizer shared by both models.
model = T5ForConditionalGeneration.from_pretrained(model_dir)
tokenizer = T5Tokenizer.from_pretrained(model_dir)

# Load a second, independent copy of the model and quantize it, so the
# baseline above is left untouched for the comparison.
# Weights are quantized to qint8; activations=None leaves activations as-is.
quantized_model = T5ForConditionalGeneration.from_pretrained(model_dir)
quantize(quantized_model, weights=qint8, activations=None)
# NOTE(review): per the quanto docs, freeze() replaces the float weights with
# their quantized form -- confirm against the installed quanto version.
freeze(quantized_model)


def _timed_generate(lm, inputs):
    """Generate and decode one sequence with ``lm``; return (text, seconds).

    The measured interval covers ``generate`` and ``decode`` only, so both
    models are timed over exactly the same work.
    """
    # perf_counter is monotonic, unlike time.time(), so the measured interval
    # cannot be distorted by system clock adjustments.
    started = time.perf_counter()
    output_ids = lm.generate(**inputs, max_length=100, num_return_sequences=1)
    text = tokenizer.decode(output_ids[0], skip_special_tokens=True)
    elapsed = time.perf_counter() - started
    return text, elapsed


def generate_text(prompt):
    """Run ``prompt`` through the normal and the quantized model.

    Args:
        prompt: The input text to feed to both models.

    Returns:
        A 4-tuple of strings, in the order expected by the interface outputs:
        (normal text, normal time, quantized text, quantized time). Times are
        formatted as ``"<seconds with 2 decimals> seconds"``.
    """
    # Tokenize once, outside both timed sections: the same tensors feed both
    # models, so charging tokenization to only one of them would bias the
    # comparison.
    inputs = tokenizer(prompt, return_tensors='pt')

    text_normal, seconds_normal = _timed_generate(model, inputs)
    text_quantized, seconds_quantized = _timed_generate(quantized_model, inputs)

    return (
        text_normal,
        f"{seconds_normal:.2f} seconds",
        text_quantized,
        f"{seconds_quantized:.2f} seconds",
    )


# Create a Gradio interface: one prompt box in, four text boxes out
# (generated text and response time for each model).
iface = gr.Interface(
    fn=generate_text,
    inputs=gr.Textbox(lines=2, placeholder="Enter your prompt here..."),
    outputs=[
        gr.Textbox(label="Generated Text (Normal Model)"),
        gr.Textbox(label="Response Time (Normal Model)"),
        gr.Textbox(label="Generated Text (Quantized Model)"),
        gr.Textbox(label="Response Time (Quantized Model)"),
    ],
    # Title names the model family actually loaded above (T5 from "t5flan");
    # it previously said "TinyLlama", which this script never loads.
    title="Flan-T5 Text Generation Comparison",
)

# Launch the interface
iface.launch()