import os
from pathlib import Path
from threading import Thread

import torch
import openvino as ov
import gradio as gr
from transformers import AutoConfig, AutoTokenizer, TextIteratorStreamer
from optimum.intel.openvino import OVModelForCausalLM

from gradio_helper import make_demo
from llm_config import SUPPORTED_LLM_MODELS
# Define model configuration
model_language = "en"  # Example: set to English
model_id = "qwen2.5-0.5b-instruct"  # Example model ID

# Define model directories
pt_model_id = SUPPORTED_LLM_MODELS[model_language][model_id]["model_id"]
int4_model_dir = Path(model_id) / "INT4_compressed_weights"
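# Hedged sketch of the structure this code assumes for llm_config.SUPPORTED_LLM_MODELS;
# the values below are illustrative, not the actual contents of llm_config:
# SUPPORTED_LLM_MODELS = {
#     "en": {
#         "qwen2.5-0.5b-instruct": {"model_id": "Qwen/Qwen2.5-0.5B-Instruct", ...},
#     },
# }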
# Ensure INT4 weights exist; if not, attempt conversion (ideally done locally before deployment)
def check_and_convert_model():
    if not (int4_model_dir / "openvino_model.xml").exists():
        print("INT4 model weights not found. Attempting compression...")
        convert_to_int4()
def convert_to_int4():
    """
    Convert the model to INT4 precision using the optimum-cli tool.
    This should only be run locally or in an environment that supports shell commands.
    """
    # Define compression parameters
    compression_configs = {
        "qwen2.5-0.5b-instruct": {"sym": True, "group_size": 128, "ratio": 1.0},
        "default": {"sym": False, "group_size": 128, "ratio": 0.8},
    }
    model_compression_params = compression_configs.get(model_id, compression_configs["default"])

    # Skip conversion if the INT4 model already exists
    if (int4_model_dir / "openvino_model.xml").exists():
        print("INT4 model already exists.")
        return

    # Build the compression command for `optimum-cli`
    export_command_base = (
        f"optimum-cli export openvino --model {pt_model_id} "
        "--task text-generation-with-past --weight-format int4"
    )
    int4_compression_args = (
        f" --group-size {model_compression_params['group_size']}"
        f" --ratio {model_compression_params['ratio']}"
    )
    if model_compression_params["sym"]:
        int4_compression_args += " --sym"
    # Other custom compression arguments (such as AWQ) can be added here;
    # see the optional sketch after this function.
    export_command = export_command_base + int4_compression_args + f" {int4_model_dir}"
    print(f"Running compression command: {export_command}")

    # Execute the export command (typically done locally, not in Hugging Face Spaces);
    # for deployment, the model should be pre-compressed and uploaded with the Space.
    os.system(export_command)
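# Hedged sketch (not in the original app): recent optimum-intel releases also accept
# data-aware compression flags such as AWQ; verify the exact flag names with
# `optimum-cli export openvino --help` before relying on them.
def awq_compression_args(dataset: str = "wikitext2") -> str:
    """Return extra optimum-cli flags enabling AWQ with a calibration dataset (assumed flag names)."""
    return f" --awq --dataset {dataset}"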
# Check if the INT4 model exists or needs conversion
check_and_convert_model()

# Load tokenizer (after the INT4 model directory is guaranteed to exist)
tok = AutoTokenizer.from_pretrained(int4_model_dir, trust_remote_code=True)
# Initialize the OpenVINO model
core = ov.Core()
ov_model = OVModelForCausalLM.from_pretrained(
    str(int4_model_dir),
    device="CPU",  # Adjust device as needed (e.g., "GPU" or "CPU")
    config=AutoConfig.from_pretrained(str(int4_model_dir), trust_remote_code=True),
    trust_remote_code=True,
)
def convert_history_to_token(history):
    """
    Convert the conversation history into input token ids using the
    tokenizer's chat template.
    """
    messages = []
    for user_msg, assistant_msg in history:
        messages.append({"role": "user", "content": user_msg})
        if assistant_msg:
            messages.append({"role": "assistant", "content": assistant_msg})
    return tok.apply_chat_template(messages, add_generation_prompt=True, return_tensors="pt")
def bot(history, temperature, top_p, top_k, repetition_penalty, conversation_id):
    """
    Bot logic: tokenize the conversation history, run generation in a
    background thread, and stream partial responses back to the UI.
    """
    input_ids = convert_history_to_token(history)
    streamer = TextIteratorStreamer(tok, timeout=3600.0, skip_prompt=True, skip_special_tokens=True)
    generate_kwargs = dict(
        input_ids=input_ids,
        max_new_tokens=256,
        temperature=temperature,
        do_sample=temperature > 0.0,
        top_p=top_p,
        top_k=top_k,
        repetition_penalty=repetition_penalty,
        streamer=streamer,
    )

    # Run generation in a background thread so tokens can be streamed as they arrive
    Thread(target=ov_model.generate, kwargs=generate_kwargs).start()

    # Stream the generated response and update the last history entry
    partial_text = ""
    for new_text in streamer:
        partial_text += new_text
        history[-1][1] = partial_text
        yield history
# Gradio interface setup
demo = make_demo(run_fn=bot, stop_fn=None, title="OpenVINO Chatbot", language="en")
demo.launch(debug=True, share=True)
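# Hedged sketch (not part of the original app): if converting at startup is too slow or
# fails on a Space, pre-converted INT4 weights can be downloaded instead. The repo id
# below is hypothetical; substitute a repository that actually hosts OpenVINO INT4
# weights for the chosen model.
#
# from huggingface_hub import snapshot_download
#
# def download_precompressed(repo_id: str = "your-org/qwen2.5-0.5b-instruct-int4-ov") -> Path:
#     """Download ready-made INT4 OpenVINO weights into the expected local directory."""
#     return Path(snapshot_download(repo_id=repo_id, local_dir=str(int4_model_dir)))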