Update README.md
README.md CHANGED
@@ -47,9 +47,9 @@ You can reproduce the results above via `pip install lm-eval==0.4.3`
 First, install the dependencies:
 ```
 pip install git+https://github.com/mobiusml/hqq.git #master branch fix
-pip install bitblas
+pip install bitblas #if you use the bitblas backend
 ```
-Also, make sure you use at least torch `2.4.0` or the nightly build.
+Also, make sure you use at least torch `2.4.0` or the nightly build with at least CUDA 12.1.
 
 Then you can use the sample code below:
 ``` Python
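Note: the bumped requirement (torch `2.4.0`+ built against CUDA 12.1+) can be sanity-checked before installing the backends; a minimal sketch, assuming a CUDA build of PyTorch is already present:

```python
# Minimal environment check (sketch): confirm the versions the README asks for.
import torch

print(torch.__version__)   # expect >= 2.4.0, or a nightly build
print(torch.version.cuda)  # expect >= 12.1 (CUDA toolkit torch was built against)
assert torch.cuda.is_available(), "a CUDA GPU is assumed for the quantized backends"
```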
@@ -65,7 +65,7 @@ from hqq.utils.generation_hf import HFGenerator
 #model_id = 'mobiuslabsgmbh/Llama-3.1-8b-instruct_4bitgs64_hqq' #no calib version
 model_id = 'mobiuslabsgmbh/Llama-3.1-8b-instruct_4bitgs64_hqq_calib' #calibrated version
 
-compute_dtype = torch.float16
+compute_dtype = torch.bfloat16 #bfloat16 for torchao_int4, float16 for bitblas
 cache_dir = '.'
 model = AutoHQQHFModel.from_quantized(model_id, cache_dir=cache_dir, compute_dtype=compute_dtype)
 tokenizer = AutoTokenizer.from_pretrained(model_id, cache_dir=cache_dir)
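The new comment ties `compute_dtype` to the chosen backend; the selection can be written out explicitly, as in this small sketch (the `backend` variable is illustrative and not part of the README code):

```python
import torch

# Assumption, following the comment in this hunk:
# torchao_int4 expects bfloat16, bitblas expects float16.
backend = "torchao_int4"  # or "bitblas"
compute_dtype = torch.bfloat16 if backend == "torchao_int4" else torch.float16
```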
@@ -77,8 +77,8 @@ patch_linearlayers(model, patch_add_quant_config, quant_config)
 ###################################################
 HQQLinear.set_backend(HQQBackend.PYTORCH)
 #prepare_for_inference(model) #default backend
-
-prepare_for_inference(model, backend="bitblas") #takes a while to init...
+prepare_for_inference(model, backend="torchao_int4")
+#prepare_for_inference(model, backend="bitblas") #takes a while to init...
 
 #Generate
 ###################################################
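For context, the `#Generate` marker is followed in the README by the `HFGenerator` import shown in the second hunk header; typical usage looks roughly like the sketch below, assuming the `model` and `tokenizer` objects created earlier (argument values are illustrative and may differ from the actual README):

```python
from hqq.utils.generation_hf import HFGenerator

# Sketch: build the generator, warm it up, then stream tokens for a prompt.
gen = HFGenerator(model, tokenizer, max_new_tokens=1000,
                  do_sample=True, compile="partial").warmup()
gen.generate("Write an essay about large language models", print_tokens=True)
```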