Spaces:

lightmate
/

llm-chatbot

Runtime error

App Files Files Community

lightmate committed on Nov 6, 2024

Commit

fb42888

verified ·

1 Parent(s): 6fbaf70

Update app.py

Browse files

Files changed (1) hide show

app.py +27 -61

app.py CHANGED Viewed

@@ -4,81 +4,44 @@ from transformers import AutoTokenizer, AutoConfig
 from optimum.intel.openvino import OVModelForCausalLM
 import openvino as ov
 import gradio as gr
 from gradio_helper import make_demo
 from llm_config import SUPPORTED_LLM_MODELS
-from pathlib import Path
 # Define model configuration
-model_language = "English"  # Example: set to English
-model_id = "qwen2.5-0.5b-instruct"  # Example model ID
-# Define model directories
-pt_model_id = SUPPORTED_LLM_MODELS[model_language][model_id]["model_id"]
-int4_model_dir = Path(model_id) / "INT4_compressed_weights"
-# Load tokenizer
 tok = AutoTokenizer.from_pretrained(int4_model_dir, trust_remote_code=True)
-# Ensure INT4 weights exist; if not, attempt conversion (locally before deployment)
-def check_and_convert_model():
-    if not (int4_model_dir / "openvino_model.xml").exists():
-        print("INT4 model weights not found. Attempting compression...")
-        convert_to_int4()
-def convert_to_int4():
-    """
-    Converts a model to INT4 precision using the optimum-cli tool.
-    This function should only be run locally or in an environment that supports shell commands.
-    """
-    # Define compression parameters
-    compression_configs = {
-        "qwen2.5-0.5b-instruct": {"sym": True, "group_size": 128, "ratio": 1.0},
-        "default": {"sym": False, "group_size": 128, "ratio": 0.8},
-    }
-    model_compression_params = compression_configs.get(model_id, compression_configs["default"])
-    # Check if the INT4 model already exists
-    if (int4_model_dir / "openvino_model.xml").exists():
-        print("INT4 model already exists.")
-        return  # Exit if the model is already converted
-    # Run model compression using `optimum-cli`
-    export_command_base = f"optimum-cli export openvino --model {pt_model_id} --task text-generation-with-past --weight-format int4"
-    int4_compression_args = f" --group-size {model_compression_params['group_size']} --ratio {model_compression_params['ratio']}"
-    if model_compression_params["sym"]:
-        int4_compression_args += " --sym"
-    # You can add other custom compression arguments here (like AWQ)
-    export_command = export_command_base + int4_compression_args
-    print(f"Running compression command: {export_command}")
-    # Execute the export command (this is typically done locally, not in Hugging Face Spaces)
-    # For deployment, the model needs to be pre-compressed and uploaded
-    os.system(export_command)
-# Check if the INT4 model exists or needs conversion
-check_and_convert_model()
-# Initialize OpenVINO model
-core = ov.Core()
 ov_model = OVModelForCausalLM.from_pretrained(
-    str(int4_model_dir),
-    device="CPU",  # Adjust device as needed (e.g., "GPU" or "CPU")
-    config=AutoConfig.from_pretrained(str(int4_model_dir), trust_remote_code=True),
     trust_remote_code=True,
 )
-def convert_history_to_token(history):
     """
-    Convert the history of the conversation into tokens for the model.
     """
-    input_ids = tok.encode(history[-1][0])  # Example tokenization
     return torch.LongTensor([input_ids])
 def bot(history, temperature, top_p, top_k, repetition_penalty, conversation_id):
     """
-    Bot logic to process conversation history and generate responses.
     """
     input_ids = convert_history_to_token(history)
     streamer = TextIteratorStreamer(tok, timeout=3600.0, skip_prompt=True, skip_special_tokens=True)
@@ -93,16 +56,19 @@ def bot(history, temperature, top_p, top_k, repetition_penalty, conversation_id)
         streamer=streamer,
     )
-    # Generate response
     ov_model.generate(**generate_kwargs)
-    # Stream and update history with generated response
     partial_text = ""
     for new_text in streamer:
         partial_text += new_text
         history[-1][1] = partial_text
         yield history
-# Gradio interface setup
-demo = make_demo(run_fn=bot, stop_fn=None, title="OpenVINO Chatbot", language="en")
 demo.launch(debug=True, share=True)

 from optimum.intel.openvino import OVModelForCausalLM
 import openvino as ov
 import gradio as gr
+from typing import List, Tuple
+from threading import Event, Thread
 from gradio_helper import make_demo
 from llm_config import SUPPORTED_LLM_MODELS
 # Define model configuration
+model_language = "English"  # For example, set the model language to English
+model_id = "qwen2.5-0.5b-instruct"  # For example, select a model ID
+# Load model configuration
+model_configuration = SUPPORTED_LLM_MODELS[model_language][model_id]
+pt_model_id = model_configuration["model_id"]
+int4_model_dir = os.path.join(model_id, "INT4_compressed_weights")
+# Load the OpenVINO model and tokenizer
+device = "CPU"  # Or GPU if available
+core = ov.Core()
+model_name = model_configuration["model_id"]
 tok = AutoTokenizer.from_pretrained(int4_model_dir, trust_remote_code=True)
+# Load the OpenVINO model
 ov_model = OVModelForCausalLM.from_pretrained(
+    int4_model_dir,
+    device=device,
+    config=AutoConfig.from_pretrained(int4_model_dir, trust_remote_code=True),
     trust_remote_code=True,
 )
+def convert_history_to_token(history: List[Tuple[str, str]]):
     """
+    Converts conversation history to tokens based on model configuration.
     """
+    input_ids = tok.encode(history[-1][0])  # Simple example for tokenizing the last user input.
     return torch.LongTensor([input_ids])
 def bot(history, temperature, top_p, top_k, repetition_penalty, conversation_id):
     """
+    Generates the next part of the conversation.
     """
     input_ids = convert_history_to_token(history)
     streamer = TextIteratorStreamer(tok, timeout=3600.0, skip_prompt=True, skip_special_tokens=True)
         streamer=streamer,
     )
+    # Generation process
     ov_model.generate(**generate_kwargs)
+    # Stream and update history
     partial_text = ""
     for new_text in streamer:
         partial_text += new_text
         history[-1][1] = partial_text
         yield history
+def request_cancel():
+    ov_model.request.cancel()
+# Gradio UI
+demo = make_demo(run_fn=bot, stop_fn=request_cancel, title="OpenVINO Chatbot", language="en")
 demo.launch(debug=True, share=True)