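# Gradio demo: chat with an INT4-compressed LLM served through OpenVINO.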
import os
from pathlib import Path
from threading import Event, Thread

import requests
import torch
import gradio as gr
from transformers import AutoConfig, AutoTokenizer, TextIteratorStreamer
from optimum.intel.openvino import OVModelForCausalLM
import openvino as ov
import openvino.properties as props
import openvino.properties.hint as hints
import openvino.properties.streams as streams

from llm_config import SUPPORTED_LLM_MODELS
# Initialize model language options
model_languages = list(SUPPORTED_LLM_MODELS)
# Define the Gradio UI within a Blocks context
with gr.Blocks(title="OpenVINO Chatbot") as iface:
    gr.Markdown("# OpenVINO Chatbot")

    # Dropdown for model language selection
    model_language = gr.Dropdown(
        choices=model_languages,
        value=model_languages[0],
        label="Model Language",
    )

    # Dropdown for model ID, dynamically populated
    model_id = gr.Dropdown(
        choices=[],  # will be populated dynamically
        label="Model",
        value=None,
    )
    # Update model_id dropdown choices based on the selected model_language
    def update_model_id(model_language_value):
        model_ids = list(SUPPORTED_LLM_MODELS[model_language_value])
        return gr.update(value=model_ids[0], choices=model_ids)

    # Update model_id choices when model_language changes
    model_language.change(update_model_id, inputs=model_language, outputs=model_id)
    # Checkbox for INT4 model preparation
    prepare_int4_model = gr.Checkbox(
        value=True,
        label="Prepare INT4 Model",
    )

    # Checkbox for enabling AWQ (shown conditionally)
    enable_awq = gr.Checkbox(
        value=False,
        label="Enable AWQ",
        visible=False,  # visibility can be controlled in the UI logic
    )

    # Dropdown for device selection
    device = gr.Dropdown(
        choices=["CPU", "GPU"],
        value="CPU",
        label="Device",
    )
    # Retrieve the model configuration and local INT4 weights path
    def get_model_path(model_language_value, model_id_value):
        model_configuration = SUPPORTED_LLM_MODELS[model_language_value][model_id_value]
        pt_model_name = model_id_value.split("-")[0]
        int4_model_dir = Path(model_id_value) / "INT4_compressed_weights"
        return model_configuration, int4_model_dir, pt_model_name
    # Download the model if the INT4 weights are not already present
    def download_model_if_needed(model_language_value, model_id_value):
        model_configuration, int4_model_dir, pt_model_name = get_model_path(model_language_value, model_id_value)
        int4_weights = int4_model_dir / "openvino_model.bin"
        if not int4_weights.exists():
            print(f"Downloading model {model_id_value}...")
            # Download logic (e.g., requests.get(model_configuration["model_url"])) can go here
        return int4_model_dir
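    # A minimal sketch of the elided download step, assuming each entry in
    # SUPPORTED_LLM_MODELS carries a "model_url" key (hypothetical; adapt to the
    # real llm_config schema):
    def fetch_int4_weights(model_configuration, int4_model_dir):
        int4_model_dir.mkdir(parents=True, exist_ok=True)
        response = requests.get(model_configuration["model_url"], timeout=60)  # "model_url" is an assumed key
        response.raise_for_status()
        (int4_model_dir / "openvino_model.bin").write_bytes(response.content)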
    # Load the tokenizer and model for the selected options
    def load_model(model_language_value, model_id_value, device_value):
        int4_model_dir = download_model_if_needed(model_language_value, model_id_value)
        # Latency-oriented OpenVINO settings: single inference stream, no cache dir
        ov_config = {
            hints.performance_mode(): hints.PerformanceMode.LATENCY,
            streams.num(): "1",
            props.cache_dir(): "",
        }
        tok = AutoTokenizer.from_pretrained(int4_model_dir, trust_remote_code=True)
        ov_model = OVModelForCausalLM.from_pretrained(
            int4_model_dir,
            device=device_value,  # the selected device is passed in, not read from the Dropdown's initial .value
            ov_config=ov_config,
            config=AutoConfig.from_pretrained(int4_model_dir, trust_remote_code=True),
            trust_remote_code=True,
        )
        return tok, ov_model
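    # load_model runs on every generation request below; a production app would
    # memoize it. A minimal caching sketch (assumes the selections are hashable):
    _model_cache = {}

    def load_model_cached(model_language_value, model_id_value, device_value):
        key = (model_language_value, model_id_value, device_value)
        if key not in _model_cache:
            _model_cache[key] = load_model(model_language_value, model_id_value, device_value)
        return _model_cache[key]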
    # Gradio sliders for model generation parameters
    temperature = gr.Slider(minimum=0.0, maximum=1.0, value=0.7, label="Temperature")
    top_p = gr.Slider(minimum=0.0, maximum=1.0, value=0.9, label="Top P")
    top_k = gr.Slider(minimum=0, maximum=50, value=50, label="Top K")
    repetition_penalty = gr.Slider(minimum=1.0, maximum=2.0, value=1.1, label="Repetition Penalty")

    # Conversation history state: a list of [user_message, assistant_message] pairs
    history = gr.State([])
    # Stream a response for the latest user message in the history
    def generate_response(history, temperature, top_p, top_k, repetition_penalty, model_language_value, model_id_value, device_value):
        tok, ov_model = load_model(model_language_value, model_id_value, device_value)

        # Naive prompt construction: concatenate the user turns with spaces
        def convert_history_to_token(history):
            input_tokens = tok(" ".join([msg[0] for msg in history]), return_tensors="pt").input_ids
            return input_tokens

        input_ids = convert_history_to_token(history)
        # TextIteratorStreamer yields decoded text chunks as generate() produces them
        streamer = TextIteratorStreamer(tok, skip_prompt=True, skip_special_tokens=True)
        generate_kwargs = dict(
            input_ids=input_ids,
            max_new_tokens=256,
            do_sample=temperature > 0,  # greedy decoding when temperature is 0; sampling otherwise
            temperature=temperature,
            top_p=top_p,
            top_k=top_k,
            repetition_penalty=repetition_penalty,
            streamer=streamer,
        )

        # Run generation on a worker thread and signal completion via an Event
        event = Event()

        def generate_and_signal_complete():
            ov_model.generate(**generate_kwargs)
            event.set()

        t1 = Thread(target=generate_and_signal_complete)
        t1.start()

        # Accumulate streamed text into the last assistant turn and yield updates
        partial_text = ""
        for new_text in streamer:
            partial_text += new_text
            history[-1][1] = partial_text
            yield history
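    # A closer-to-production alternative to convert_history_to_token, assuming the
    # tokenizer ships a chat template (illustrative sketch; not wired into the app):
    def convert_history_with_template(tok, history):
        messages = []
        for user_msg, assistant_msg in history:
            messages.append({"role": "user", "content": user_msg})
            if assistant_msg:
                messages.append({"role": "assistant", "content": assistant_msg})
        return tok.apply_chat_template(messages, add_generation_prompt=True, return_tensors="pt")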
    # Chat display and input, wired up inside the same Blocks context
    chatbot = gr.Chatbot(label="Conversation History")
    msg = gr.Textbox(label="Message")

    # Append the new user turn to the history, then stream the model's reply
    def add_user_message(message, history):
        return "", history + [[message, ""]]

    msg.submit(
        add_user_message, inputs=[msg, history], outputs=[msg, history]
    ).then(
        generate_response,
        inputs=[history, temperature, top_p, top_k, repetition_penalty, model_language, model_id, device],
        outputs=chatbot,
    )
# Launch the Gradio app
if __name__ == "__main__":
    iface.launch(debug=True, share=True, server_name="0.0.0.0", server_port=7860)