Update README.md

README.md (CHANGED)

@@ -28,14 +28,14 @@ pip install torchao
 Then we can serve with the following command:
 ```Shell
 # Server
-export MODEL=
+export MODEL=pytorch/Phi-4-mini-instruct-AWQ-INT4
 VLLM_DISABLE_COMPILE_CACHE=1 vllm serve $MODEL --tokenizer $MODEL -O3
 ```

 ```Shell
 # Client
 curl http://localhost:8000/v1/chat/completions -H "Content-Type: application/json" -d '{
-"model": "
+"model": "pytorch/Phi-4-mini-instruct-AWQ-INT4",
 "messages": [
 {"role": "user", "content": "Give me a short introduction to large language models."}
 ],
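Not part of the diff: since `vllm serve` exposes an OpenAI-compatible endpoint, the client request in the hunk above can also be issued from Python. A minimal sketch, assuming the `openai` package is installed and the server from the hunk is running on `localhost:8000`:

```python
# Sketch only: the same chat request as the curl command above, sent through
# vLLM's OpenAI-compatible API. Assumes `pip install openai` and the server
# started with `vllm serve` as in the hunk above.
from openai import OpenAI

client = OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")

response = client.chat.completions.create(
    model="pytorch/Phi-4-mini-instruct-AWQ-INT4",
    messages=[
        {"role": "user", "content": "Give me a short introduction to large language models."},
    ],
)
print(response.choices[0].message.content)
```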
@@ -64,7 +64,7 @@ Example:
 import torch
 from transformers import AutoModelForCausalLM, AutoTokenizer

-model_name = "
+model_name = "pytorch/Phi-4-mini-instruct-AWQ-INT4"

 # load the tokenizer and the model
 tokenizer = AutoTokenizer.from_pretrained(model_name)
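The hunk ends at the tokenizer load. Purely as an illustration (the remaining README lines are not shown in this diff), a typical continuation that loads the quantized checkpoint and runs a short generation might look like the following; the prompt and generation settings here are placeholders:

```python
# Illustrative continuation, not taken verbatim from the README: load the
# checkpoint named above and generate a short completion with transformers.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.bfloat16,
    device_map="auto",
)

messages = [
    {"role": "user", "content": "Give me a short introduction to large language models."},
]
inputs = tokenizer.apply_chat_template(
    messages, add_generation_prompt=True, return_tensors="pt"
).to(model.device)

output = model.generate(inputs, max_new_tokens=128)
print(tokenizer.decode(output[0][inputs.shape[-1]:], skip_special_tokens=True))
```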
@@ -234,7 +234,7 @@ lm_eval --model hf --model_args pretrained=microsoft/Phi-4-mini-instruct --tasks

 ## AWQ-INT4
 ```Shell
-export MODEL=
+export MODEL=pytorch/Phi-4-mini-instruct-AWQ-INT4
 lm_eval --model hf --model_args pretrained=$MODEL --tasks mmlu --device cuda:0 --batch_size 8
 ```
 </details>
@@ -263,7 +263,7 @@ import torch
 from transformers import AutoModelForCausalLM, AutoTokenizer, TorchAoConfig

 # use "microsoft/Phi-4-mini-instruct" or "pytorch/Phi-4-mini-instruct-AWQ-INT4"
-model_id = "
+model_id = "pytorch/Phi-4-mini-instruct-AWQ-INT4"
 quantized_model = AutoModelForCausalLM.from_pretrained(model_id, device_map="auto", torch_dtype=torch.bfloat16)
 tokenizer = AutoTokenizer.from_pretrained(model_id)
@@ -343,7 +343,7 @@ python benchmarks/benchmark_latency.py --input-len 256 --output-len 256 --model

 ### AWQ-INT4
 ```Shell
-export MODEL=
+export MODEL=pytorch/Phi-4-mini-instruct-AWQ-INT4
 VLLM_DISABLE_COMPILE_CACHE=1 python benchmarks/benchmark_latency.py --input-len 256 --output-len 256 --model $MODEL --batch-size 1
 ```
@@ -379,13 +379,13 @@ python benchmarks/benchmark_serving.py --backend vllm --dataset-name sharegpt --
 ### AWQ-INT4
 Server:
 ```Shell
-export MODEL=
+export MODEL=pytorch/Phi-4-mini-instruct-AWQ-INT4
 VLLM_DISABLE_COMPILE_CACHE=1 vllm serve $MODEL --tokenizer $MODEL -O3 --pt-load-map-location cuda:0
 ```

 Client:
 ```Shell
-export MODEL=
+export MODEL=pytorch/Phi-4-mini-instruct-AWQ-INT4
 python benchmarks/benchmark_serving.py --backend vllm --dataset-name sharegpt --tokenizer $MODEL --dataset-path ./ShareGPT_V3_unfiltered_cleaned_split.json --model $MODEL --num-prompts 1
 ```
 </details>