|
|
import gradio as gr |
|
|
import time |
|
|
from transformers import T5Tokenizer, T5ForConditionalGeneration |
|
|
from quanto import quantize, freeze, qint8 |
|
|
|
|
|
# Checkpoint location handed to every ``from_pretrained`` call below.
model_dir = "t5flan"

# One tokenizer is shared by both model variants.
tokenizer = T5Tokenizer.from_pretrained(model_dir)

# Baseline model, used as loaded from the checkpoint.
model = T5ForConditionalGeneration.from_pretrained(model_dir)


def _load_int8_model(path):
    """Load a separate copy of the checkpoint and quantize its weights to int8.

    A second ``from_pretrained`` call is used so that ``quantize``/``freeze``
    act on their own model object and the baseline ``model`` is left as is.
    Activations are not quantized (``activations=None``).
    """
    net = T5ForConditionalGeneration.from_pretrained(path)
    quantize(net, weights=qint8, activations=None)
    freeze(net)
    return net


# Weight-quantized counterpart of ``model`` for the side-by-side comparison.
quantized_model = _load_int8_model(model_dir)
|
|
|
|
|
|
|
|
def _timed_generate(net, inputs):
    """Run generation on ``net`` and return ``(decoded_text, elapsed_seconds)``.

    The timed region covers exactly the ``generate`` call and the decoding of
    the first returned sequence, so every model passed in is measured over
    the same work.
    """
    # perf_counter is monotonic and high-resolution; time.time() can jump if
    # the system clock is adjusted, which would corrupt short measurements.
    started = time.perf_counter()
    output_ids = net.generate(**inputs, max_length=100, num_return_sequences=1)
    text = tokenizer.decode(output_ids[0], skip_special_tokens=True)
    elapsed = time.perf_counter() - started
    return text, elapsed


def generate_text(prompt):
    """Generate text for ``prompt`` with both models and report their timings.

    Args:
        prompt: The text typed into the UI; it is tokenized once and the same
            encoded inputs are fed to both models.

    Returns:
        A 4-tuple ``(normal_text, normal_time, quantized_text, quantized_time)``
        where the two ``*_time`` entries are strings formatted as
        ``"<seconds with two decimals> seconds"``.
    """
    # Tokenize outside of both timed regions: previously tokenization was
    # counted against the normal model only, which biased the comparison.
    inputs = tokenizer(prompt, return_tensors='pt')

    text_normal, secs_normal = _timed_generate(model, inputs)
    text_quantized, secs_quantized = _timed_generate(quantized_model, inputs)

    return (text_normal, f"{secs_normal:.2f} seconds",
            text_quantized, f"{secs_quantized:.2f} seconds")
|
|
|
|
|
|
|
|
# Gradio UI: one prompt box in, four text boxes out. The order of ``outputs``
# must match the order of the tuple returned by ``generate_text``.
iface = gr.Interface(
    fn=generate_text,
    inputs=gr.Textbox(lines=2, placeholder="Enter your prompt here..."),
    outputs=[
        gr.Textbox(label="Generated Text (Normal Model)"),
        gr.Textbox(label="Response Time (Normal Model)"),
        gr.Textbox(label="Generated Text (Quantized Model)"),
        gr.Textbox(label="Response Time (Quantized Model)"),
    ],
    # The models loaded by this script are T5ForConditionalGeneration
    # instances; the previous title named a different model ("TinyLlama").
    title="T5 Text Generation Comparison",
)

# Start the web server as soon as the script is run.
iface.launch()
|
|
|