Update README.md

README.md (CHANGED)

@@ -28,14 +28,14 @@ pip install torchao
 Then we can serve with the following command:
 ```Shell
 # Server
-export MODEL=
+export MODEL=pytorch/Phi-4-mini-instruct-AWQ-INT4
 VLLM_DISABLE_COMPILE_CACHE=1 vllm serve $MODEL --tokenizer $MODEL -O3
 ```

 ```Shell
 # Client
 curl http://localhost:8000/v1/chat/completions -H "Content-Type: application/json" -d '{
-"model": "
+"model": "pytorch/Phi-4-mini-instruct-AWQ-INT4",
 "messages": [
 {"role": "user", "content": "Give me a short introduction to large language models."}
 ],
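Not part of the diff: since `vllm serve` exposes an OpenAI-compatible endpoint, the client request in the hunk above can also be issued from Python. A minimal sketch, assuming the `openai` package is installed and the server from the hunk is running on `localhost:8000`:

```python
# Sketch only: the same chat request as the curl command above, sent through
# vLLM's OpenAI-compatible API. Assumes `pip install openai` and the server
# started with `vllm serve` as in the hunk above.
from openai import OpenAI

client = OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")

response = client.chat.completions.create(
    model="pytorch/Phi-4-mini-instruct-AWQ-INT4",
    messages=[
        {"role": "user", "content": "Give me a short introduction to large language models."},
    ],
)
print(response.choices[0].message.content)
```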
@@ -64,7 +64,7 @@ Example:
 import torch
 from transformers import AutoModelForCausalLM, AutoTokenizer

-model_name = "
+model_name = "pytorch/Phi-4-mini-instruct-AWQ-INT4"

 # load the tokenizer and the model
 tokenizer = AutoTokenizer.from_pretrained(model_name)
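The hunk ends at the tokenizer load. Purely as an illustration (the remaining README lines are not shown in this diff), a typical continuation that loads the quantized checkpoint and runs a short generation might look like the following; the prompt and generation settings here are placeholders:

```python
# Illustrative continuation, not taken verbatim from the README: load the
# checkpoint named above and generate a short completion with transformers.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.bfloat16,
    device_map="auto",
)

messages = [
    {"role": "user", "content": "Give me a short introduction to large language models."},
]
inputs = tokenizer.apply_chat_template(
    messages, add_generation_prompt=True, return_tensors="pt"
).to(model.device)

output = model.generate(inputs, max_new_tokens=128)
print(tokenizer.decode(output[0][inputs.shape[-1]:], skip_special_tokens=True))
```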
@@ -234,7 +234,7 @@ lm_eval --model hf --model_args pretrained=microsoft/Phi-4-mini-instruct --tasks

 ## AWQ-INT4
 ```Shell
-export MODEL=
+export MODEL=pytorch/Phi-4-mini-instruct-AWQ-INT4
 lm_eval --model hf --model_args pretrained=$MODEL --tasks mmlu --device cuda:0 --batch_size 8
 ```
 </details>
@@ -263,7 +263,7 @@ import torch
 from transformers import AutoModelForCausalLM, AutoTokenizer, TorchAoConfig

 # use "microsoft/Phi-4-mini-instruct" or "pytorch/Phi-4-mini-instruct-AWQ-INT4"
-model_id = "
+model_id = "pytorch/Phi-4-mini-instruct-AWQ-INT4"
 quantized_model = AutoModelForCausalLM.from_pretrained(model_id, device_map="auto", torch_dtype=torch.bfloat16)
 tokenizer = AutoTokenizer.from_pretrained(model_id)
@@ -343,7 +343,7 @@ python benchmarks/benchmark_latency.py --input-len 256 --output-len 256 --model

 ### AWQ-INT4
 ```Shell
-export MODEL=
+export MODEL=pytorch/Phi-4-mini-instruct-AWQ-INT4
 VLLM_DISABLE_COMPILE_CACHE=1 python benchmarks/benchmark_latency.py --input-len 256 --output-len 256 --model $MODEL --batch-size 1
 ```
@@ -379,13 +379,13 @@ python benchmarks/benchmark_serving.py --backend vllm --dataset-name sharegpt --
 ### AWQ-INT4
 Server:
 ```Shell
-export MODEL=
+export MODEL=pytorch/Phi-4-mini-instruct-AWQ-INT4
 VLLM_DISABLE_COMPILE_CACHE=1 vllm serve $MODEL --tokenizer $MODEL -O3 --pt-load-map-location cuda:0
 ```

 Client:
 ```Shell
-export MODEL=
+export MODEL=pytorch/Phi-4-mini-instruct-AWQ-INT4
 python benchmarks/benchmark_serving.py --backend vllm --dataset-name sharegpt --tokenizer $MODEL --dataset-path ./ShareGPT_V3_unfiltered_cleaned_split.json --model $MODEL --num-prompts 1
 ```
 </details>