Spaces:

OpenVINO
/

nncf-quantization

Running

App Files Files Community

nncf-quantization / app.py

katuni4ka

upgrade-optimum-intel (#2)

66e32bb verified 6 months ago

raw

history blame contribute delete

10.4 kB

	import os
	import shutil
	import gradio as gr
	from huggingface_hub import HfApi, whoami, ModelCard, model_info
	from gradio_huggingfacehub_search import HuggingfaceHubSearch
	from textwrap import dedent
	from pathlib import Path

	from tempfile import TemporaryDirectory

	from huggingface_hub.file_download import repo_folder_name
	from optimum.exporters import TasksManager
	from optimum.intel import (
	OVModelForAudioClassification,
	OVModelForCausalLM,
	OVModelForFeatureExtraction,
	OVModelForImageClassification,
	OVModelForMaskedLM,
	OVModelForQuestionAnswering,
	OVModelForSeq2SeqLM,
	OVModelForSequenceClassification,
	OVModelForTokenClassification,
	OVStableDiffusionPipeline,
	OVStableDiffusionXLPipeline,
	OVLatentConsistencyModelPipeline,
	OVWeightQuantizationConfig,
	)
	from diffusers import ConfigMixin

	_HEAD_TO_AUTOMODELS = {
	"feature-extraction": "OVModelForFeatureExtraction",
	"fill-mask": "OVModelForMaskedLM",
	"text-generation": "OVModelForCausalLM",
	"text-classification": "OVModelForSequenceClassification",
	"token-classification": "OVModelForTokenClassification",
	"question-answering": "OVModelForQuestionAnswering",
	"image-classification": "OVModelForImageClassification",
	"audio-classification": "OVModelForAudioClassification",
	"stable-diffusion": "OVStableDiffusionPipeline",
	"stable-diffusion-xl": "OVStableDiffusionXLPipeline",
	"latent-consistency": "OVLatentConsistencyModelPipeline",
	}

	def quantize_model(
	    model_id: str,
	    dtype: str,
	    calibration_dataset: str,
	    ratio: float,
	    private_repo: bool,
	    overwritte: bool,
	    oauth_token: gr.OAuthToken,
	) -> str:
	    """Quantize a Hub model with NNCF weight-only quantization and push the result.

	    The model is exported/quantized through the optimum-intel class matching
	    its task, saved to a temporary folder, uploaded file by file to
	    ``<username>/<model name>-openvino-<dtype>`` and given a model card.

	    Args:
	        model_id: Hub id of the model to quantize.
	        dtype: Weights precision. ``"8-bit"`` selects 8-bit weights; any other
	            value is treated as 4-bit.
	        calibration_dataset: Calibration dataset name, or the literal string
	            ``"None"`` for no dataset. A dataset combined with 4-bit selects
	            the ``"awq"`` method; otherwise the ``"default"`` method is used
	            and the dataset is ignored.
	        ratio: Ratio forwarded to ``OVWeightQuantizationConfig`` for 4-bit
	            quantization; forced to ``1.0`` for 8-bit.
	        private_repo: Create the destination repository as private.
	        overwritte: Allow pushing to a destination repository that already
	            exists (spelling kept, it is part of the call interface).
	        oauth_token: Hugging Face OAuth token of the logged-in user.

	    Returns:
	        A status message for display. Failures are reported through the
	        returned message; the function does not raise.
	    """
	    # NOTE(review): Gradio may hand over no token object at all for anonymous
	    # users depending on how the parameter is annotated -- guard both cases.
	    if oauth_token is None or oauth_token.token is None:
	        return "You must be logged in to use this space"
	    token = oauth_token.token

	    if not model_id:
	        return f"### Invalid input 🐞 Please specify a model name, got {model_id}"

	    try:
	        model_name = model_id.split("/")[-1]
	        username = whoami(token)["name"]
	        w_t = dtype.replace("-", "")
	        # Avoid a doubled "openvino" when the source model name already ends with it.
	        suffix = f"{w_t}" if model_name.endswith("openvino") else f"openvino-{w_t}"
	        new_repo_id = f"{username}/{model_name}-{suffix}"

	        task = _infer_task(model_id, token)
	        if task == "text2text-generation":
	            return "Export of Seq2Seq models is currently disabled."

	        if task not in _HEAD_TO_AUTOMODELS:
	            supported_tasks = ", ".join(_HEAD_TO_AUTOMODELS)
	            return f"The task '{task}' is not supported, only {supported_tasks} tasks are supported"

	        auto_model_class = _HEAD_TO_AUTOMODELS[task]
	        # The dropdown encodes "no dataset" as the string "None".
	        if calibration_dataset == "None":
	            calibration_dataset = None

	        is_int8 = dtype == "8-bit"
	        # NOTE: a "hybrid" method for diffusers pipelines is currently disabled.
	        if not is_int8 and calibration_dataset is not None:
	            quant_method = "awq"
	        else:
	            if calibration_dataset is not None:
	                print("Default quantization was selected, calibration dataset won't be used")
	            quant_method = "default"
	        uses_calibration = quant_method != "default"

	        quantization_config = OVWeightQuantizationConfig(
	            bits=8 if is_int8 else 4,
	            quant_method=quant_method,
	            dataset=calibration_dataset if uses_calibration else None,
	            ratio=1.0 if is_int8 else ratio,
	            num_samples=20 if uses_calibration else None,
	        )

	        api = HfApi(token=token)
	        if api.repo_exists(new_repo_id) and not overwritte:
	            return f"Model {new_repo_id} already exist, please tick the overwritte box to push on an existing repository"

	        # Resolve the class by attribute lookup instead of eval(): only names
	        # exported by optimum.intel can be reached this way.
	        from optimum import intel as optimum_intel

	        model_class = getattr(optimum_intel, auto_model_class)

	        with TemporaryDirectory() as d:
	            folder = os.path.join(d, repo_folder_name(repo_id=model_id, repo_type="models"))
	            os.makedirs(folder)

	            try:
	                api.snapshot_download(repo_id=model_id, local_dir=folder, allow_patterns=["*.json"])
	                ov_model = model_class.from_pretrained(
	                    model_id,
	                    cache_dir=folder,
	                    token=token,
	                    quantization_config=quantization_config,
	                )
	                ov_model.save_pretrained(folder)
	                new_repo_url = api.create_repo(repo_id=new_repo_id, exist_ok=True, private=private_repo)
	                new_repo_id = new_repo_url.repo_id
	                print("Repository created successfully!", new_repo_url)

	                folder_path = Path(folder)
	                # "" is the folder root; the other entries are the pipeline
	                # sub-folders that get uploaded when present.
	                for dir_name in (
	                    "",
	                    "vae_encoder",
	                    "vae_decoder",
	                    "text_encoder",
	                    "text_encoder_2",
	                    "unet",
	                    "tokenizer",
	                    "tokenizer_2",
	                    "scheduler",
	                    "feature_extractor",
	                ):
	                    source_dir = folder_path / dir_name
	                    if not source_dir.is_dir():
	                        continue
	                    for file_path in source_dir.iterdir():
	                        if not file_path.is_file():
	                            continue
	                        # Repository paths use forward slashes regardless of the host OS.
	                        path_in_repo = f"{dir_name}/{file_path.name}" if dir_name else file_path.name
	                        try:
	                            api.upload_file(
	                                path_or_fileobj=file_path,
	                                path_in_repo=path_in_repo,
	                                repo_id=new_repo_id,
	                            )
	                        except Exception as e:
	                            return f"Error uploading file {file_path}: {e}"

	                try:
	                    card = ModelCard.load(model_id, token=token)
	                except Exception:
	                    # Best effort: start from an empty card when the source
	                    # model card cannot be loaded.
	                    card = ModelCard("")

	                if card.data.tags is None:
	                    card.data.tags = []
	                # Add each tag once so re-quantized models do not pile up duplicates.
	                for tag in ("openvino", "nncf", dtype):
	                    if tag not in card.data.tags:
	                        card.data.tags.append(tag)
	                card.data.base_model = model_id

	                card.text = _model_card_text(model_id, new_repo_id, auto_model_class)
	                card_path = os.path.join(folder, "README.md")
	                card.save(card_path)

	                api.upload_file(
	                    path_or_fileobj=card_path,
	                    path_in_repo="README.md",
	                    repo_id=new_repo_id,
	                )
	                return f"This model was successfully quantized, find it under your repository {new_repo_url}"
	            finally:
	                shutil.rmtree(folder, ignore_errors=True)
	    except Exception as e:
	        # Top-level boundary of the UI callback: report the failure as text.
	        return f"### Error: {e}"


	def _infer_task(model_id: str, token: str) -> str:
	    """Return the task of *model_id*: a diffusers pipeline family or the TasksManager task."""
	    library_name = TasksManager.infer_library_from_model(model_id, token=token)
	    if library_name != "diffusers":
	        return TasksManager.infer_task_from_model(model_id, token=token)

	    # Make load_config read the pipeline index file. NOTE: this rebinds a
	    # class attribute on ConfigMixin and is not restored afterwards.
	    ConfigMixin.config_name = "model_index.json"
	    class_name = ConfigMixin.load_config(model_id, token=token)["_class_name"].lower()
	    if "xl" in class_name:
	        return "stable-diffusion-xl"
	    if "consistency" in class_name:
	        return "latent-consistency"
	    return "stable-diffusion"


	def _model_card_text(model_id: str, new_repo_id: str, auto_model_class: str) -> str:
	    """Build the Markdown body of the model card written to the quantized repository."""
	    return dedent(
	        f"""
	        This model is a quantized version of [`{model_id}`](https://huggingface.co/{model_id}) and is converted to the OpenVINO format. This model was obtained via the [nncf-quantization](https://huggingface.co/spaces/echarlaix/nncf-quantization) space with [optimum-intel](https://github.com/huggingface/optimum-intel).

	        First make sure you have `optimum-intel` installed:

	        ```bash
	        pip install optimum[openvino]
	        ```

	        To load your model you can do as follows:

	        ```python
	        from optimum.intel import {auto_model_class}

	        model_id = "{new_repo_id}"
	        model = {auto_model_class}.from_pretrained(model_id)
	        ```
	        """
	    )

	# Markdown blurb passed as `description` to the gr.Interface defined further
	# down in this file.
	DESCRIPTION = """
	This Space uses [Optimum Intel](https://github.com/huggingface/optimum-intel) to automatically apply NNCF [Weight Only Quantization](https://huggingface.co/docs/optimum/main/en/intel/openvino/optimization) (WOQ) on your model and convert it to the [OpenVINO format](https://docs.openvino.ai/2024/documentation/openvino-ir-format.html) if not already.

	After conversion, a repository will be pushed under your namespace with the resulting model.

	The list of the supported architectures can be found in the [documentation](https://huggingface.co/docs/optimum/main/en/intel/openvino/models)
	"""

	# Input widgets. They are handed to gr.Interface below in the same order as
	# the leading positional parameters of quantize_model().
	model_id = HuggingfaceHubSearch(
	    label="Hub Model ID", placeholder="Search for model id on the hub", search_type="model"
	)
	dtype = gr.Dropdown(
	    ["8-bit", "4-bit"], value="8-bit", label="Weights precision", filterable=False, visible=True
	)
	# Disabled "quantization method" selector. It is an inert string literal, so
	# nothing inside it is executed.
	"""
	quant_method = gr.Dropdown(
	["default", "awq", "hybrid"],
	value="default",
	label="Quantization method",
	filterable=False,
	visible=True,
	)
	"""
	calibration_dataset = gr.Dropdown(
	    [
	        # The string "None" stands for "no calibration dataset".
	        "None",
	        "wikitext2",
	        "c4",
	        "c4-new",
	        "conceptual_captions",
	        "laion/220k-GPT4Vision-captions-from-LIVIS",
	        "laion/filtered-wit",
	    ],
	    value="None",
	    label="Calibration dataset",
	    filterable=False,
	    visible=True,
	)
	ratio = gr.Slider(
	    label="Ratio",
	    info="Parameter used when applying 4-bit quantization to control the ratio between 4-bit and 8-bit quantization",
	    minimum=0.0,
	    maximum=1.0,
	    step=0.1,
	    value=1.0,
	)
	private_repo = gr.Checkbox(
	    value=False, label="Private repository", info="Create a private repository instead of a public one"
	)
	overwritte = gr.Checkbox(
	    value=False,
	    label="Overwrite repository content",
	    info="Enable pushing files on existing repositories, potentially overwriting existing files",
	)

	# Form wiring the widgets to quantize_model(). Its trailing `oauth_token`
	# parameter has no widget here.
	# NOTE(review): presumably Gradio supplies it from the login session based on
	# the gr.OAuthToken annotation -- confirm against the Gradio OAuth docs.
	interface = gr.Interface(
	    fn=quantize_model,
	    inputs=[model_id, dtype, calibration_dataset, ratio, private_repo, overwritte],
	    outputs=[gr.Markdown(label="output")],
	    title="Quantize your model with NNCF",
	    description=DESCRIPTION,
	    api_name=False,
	)

	# Page layout: a login notice and login button, followed by the
	# quantization form rendered inside the same Blocks container.
	with gr.Blocks() as demo:
	    gr.Markdown("You must be logged in to use this space")
	    gr.LoginButton(min_width=250)
	    interface.render()

	# Start the app when the module is executed.
	demo.launch()