lvkaokao committed · Commit b10d6d4 · Parent(s): 228e920

support aqlm and gptq 2/3 bits.

Changed files:
- src/display/utils.py +15 -2
- src/leaderboard/read_evals.py +2 -3
- src/submission/check_validity.py +11 -4
- src/submission/submit.py +15 -0
src/display/utils.py
CHANGED

@@ -204,6 +204,7 @@ class WeightType(Enum):
 
 class QuantType(Enum):
     gptq = ModelDetails(name="GPTQ", symbol="🟢")
+    aqlm = ModelDetails(name="AQLM", symbol="⭐")
     awq = ModelDetails(name="AWQ", symbol="🟩")
     llama_cpp = ModelDetails(name="llama.cpp", symbol="🔶")
     bnb = ModelDetails(name="bitsandbytes", symbol="💬")

@@ -216,6 +217,8 @@ class QuantType(Enum):
     def from_str(quant_dtype):
         if quant_dtype in ["GPTQ"]:
             return QuantType.gptq
+        if quant_dtype in ["AQLM"]:
+            return QuantType.aqlm
         if quant_dtype in ["AWQ"]:
             return QuantType.awq
         if quant_dtype in ["llama.cpp"]:

@@ -228,6 +231,8 @@ class QuantType(Enum):
 
 
 class WeightDtype(Enum):
+    int2 = ModelDetails("int2")
+    int3 = ModelDetails("int3")
     int4 = ModelDetails("int4")
     nf4 = ModelDetails("nf4")
     fp4 = ModelDetails("fp4")

@@ -235,6 +240,10 @@ class WeightDtype(Enum):
     Unknown = ModelDetails("?")
 
     def from_str(weight_dtype):
+        if weight_dtype in ["int2"]:
+            return WeightDtype.int2
+        if weight_dtype in ["int3"]:
+            return WeightDtype.int3
         if weight_dtype in ["int4"]:
             return WeightDtype.int4
         if weight_dtype in ["nf4"]:

@@ -290,6 +299,8 @@ class GroupDtype(Enum):
 class Precision(Enum):
     # float16 = ModelDetails("float16")
     # bfloat16 = ModelDetails("bfloat16")
+    qt_2bit = ModelDetails("2bit")
+    qt_3bit = ModelDetails("3bit")
     qt_4bit = ModelDetails("4bit")
     # qt_8bit = ModelDetails("8bit")
     # qt_GPTQ = ModelDetails("GPTQ")

@@ -300,8 +311,10 @@ class Precision(Enum):
         # return Precision.float16
         # if precision in ["torch.bfloat16", "bfloat16"]:
         # return Precision.bfloat16
-        if precision in ["
-            return Precision.
+        if precision in ["2bit"]:
+            return Precision.qt_2bit
+        if precision in ["3bit"]:
+            return Precision.qt_3bit
         if precision in ["4bit"]:
             return Precision.qt_4bit
         # if precision in ["GPTQ", "None"]:
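For reference, the string-to-enum dispatch these hunks extend can be exercised standalone. A minimal runnable sketch of the new AQLM path, with a stub ModelDetails standing in for the repo's real class (the stub and the staticmethod decorator are assumptions; member and method names come from the diff):

from dataclasses import dataclass
from enum import Enum

@dataclass
class ModelDetails:  # stub; the real class lives in src/display/utils.py
    name: str
    symbol: str = ""

class QuantType(Enum):
    gptq = ModelDetails(name="GPTQ", symbol="🟢")
    aqlm = ModelDetails(name="AQLM", symbol="⭐")

    @staticmethod
    def from_str(quant_dtype):
        if quant_dtype in ["GPTQ"]:
            return QuantType.gptq
        if quant_dtype in ["AQLM"]:
            return QuantType.aqlm
        return None

# After this commit, AQLM submissions resolve to their own member:
assert QuantType.from_str("AQLM") is QuantType.aqlm
assert QuantType.from_str("AQLM").value.symbol == "⭐"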
src/leaderboard/read_evals.py
CHANGED

@@ -54,8 +54,7 @@ class EvalResult:
         # Precision
         precision = Precision.from_str(config.get("precision", "4bit"))
         quant_type = QuantType.from_str(config.get("quant_type", "GPTQ"))
-
-        weight_dtype = WeightDtype.from_str(config.get("weight_dtype", "int4"))
+        weight_dtype = WeightDtype.from_str(data["task_info"].get("weight_dtype", "int4"))
         compute_dtype = ComputeDtype.from_str(data["task_info"].get("compute_dtype", "bfloat16"))
         double_quant = data["quantization_config"].get("bnb_4bit_use_double_quant", False)
         model_params = config["model_params"]

@@ -243,7 +242,7 @@ def get_raw_eval_results(results_path: str, requests_path: str, dynamic_path: str
         eval_result = EvalResult.init_from_json_file(model_result_filepath)
         eval_result.update_with_request_file(requests_path)
         if eval_result.full_model in dynamic_data:
-            eval_result.update_with_dynamic_file_dict(dynamic_data[eval_result.full_model])
+            # eval_result.update_with_dynamic_file_dict(dynamic_data[eval_result.full_model])
             # Hardcoding because of gating problem
             if "meta-llama" in eval_result.full_model:
                 eval_result.still_on_hub = True
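The first hunk moves the weight_dtype lookup from the request config to the result file's task_info block, next to where compute_dtype already lives. A hypothetical result-JSON fragment consistent with this parsing — only the key names are taken from the code; the exact nesting of config is an assumption:

# Hypothetical result-file layout implied by the parsing above.
data = {
    "config": {
        "precision": "2bit",
        "quant_type": "AQLM",
        "model_params": 7.0,
    },
    "task_info": {
        "weight_dtype": "int2",      # now read from here, not from config
        "compute_dtype": "bfloat16",
    },
    "quantization_config": {"bnb_4bit_use_double_quant": False},
}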
src/submission/check_validity.py
CHANGED

@@ -92,18 +92,22 @@ def get_model_size(model_info: ModelInfo, precision: str):
     return model_size
 
 KNOWN_SIZE_FACTOR = {
-    "gptq": {"4bit": 8, "8bit": 4},
+    "gptq": {"4bit": 8, "8bit": 4, "2bit": 8, "3bit": 12},
     "awq": {"4bit": 8},
-    "bitsandbytes": {"4bit": 2}
+    "bitsandbytes": {"4bit": 2},
+    "aqlm": {"4bit": 8, "8bit": 4, "2bit": 8, "3bit": 6},
 }
 
 BYTES = {
     "I32": 4,
+    "I16": 2,
+    "I8": 1,
     "F16": 2,
     "BF16": 2,
     "F32": 4,
     "U8": 1}
 
+
 def get_quantized_model_parameters_memory(model_info: ModelInfo, quant_method="", bits="4bit"):
     try:
         safetensors = get_safetensors_metadata(model_info.id)

@@ -111,9 +115,12 @@ def get_quantized_model_parameters_memory(model_info: ModelInfo, quant_method="", bits="4bit"):
         mem = 0
         for key in safetensors.parameter_count:
             mem += safetensors.parameter_count[key] * BYTES[key]
-
-            if key in ["I32", "U8"]:
-                num_parameters += safetensors.parameter_count[key] * KNOWN_SIZE_FACTOR[quant_method][bits]
+            if key in ["I32", "U8", "I16", "I8"]:
+                param = safetensors.parameter_count[key] * KNOWN_SIZE_FACTOR[quant_method][bits]
+                if key == "I8":
+                    param = param / 2
+                num_parameters += param
+
         params_b = round(num_parameters / 1e9, 2)
         size_gb = round(mem / 1e9,2)
         return params_b, size_gb
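The KNOWN_SIZE_FACTOR entries encode how many quantized weights are packed into one stored integer element: a 32-bit word holds 32 / 4 = 8 four-bit GPTQ values, so each I32 entry counts as 8 parameters. A worked example with a hypothetical tensor count (the 875M figure is illustrative, not from the source):

# Worked example: recovering the parameter count of a 4-bit GPTQ model
# whose packed weights are stored as I32 tensors in safetensors metadata.
packed_i32_entries = 875_000_000              # hypothetical count from metadata
factor = 8                                    # KNOWN_SIZE_FACTOR["gptq"]["4bit"]: one 32-bit word = 8 x 4-bit weights
num_parameters = packed_i32_entries * factor  # 7.0e9 real weights
size_gb = packed_i32_entries * 4 / 1e9        # BYTES["I32"] == 4 -> 3.5 GB on disk
print(round(num_parameters / 1e9, 2), round(size_gb, 2))  # 7.0 3.5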
src/submission/submit.py
CHANGED

@@ -140,6 +140,21 @@ def add_new_eval(
         hardware = "gpu"
         quant_type = "AWQ"
         precision = f"{quantization_config.get('bits', '4bit')}bit"
+    if quant_method == "aqlm":
+        hardware = "gpu"
+        quant_type = "AQLM"
+        nbits_per_codebook = quantization_config.get('nbits_per_codebook')
+        num_codebooks = quantization_config.get('num_codebooks')
+        in_group_size = quantization_config.get('in_group_size')
+        bits = int(nbits_per_codebook * num_codebooks / in_group_size)
+        precision = f"{bits}bit"
+
+    if precision == "4bit":
+        weight_dtype = "int4"
+    elif precision == "3bit":
+        weight_dtype = "int3"
+    elif precision == "2bit":
+        weight_dtype = "int2"
 
     if quant_type is None or quant_type == "":
         return styled_error("Please select a quantization model like GPTQ, AWQ etc.")
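The AQLM branch derives effective precision from the codebook geometry: bits per weight = nbits_per_codebook × num_codebooks / in_group_size. For the common AQLM "1x16" setup (one codebook, 16-bit codes, groups of 8 weights) that works out to 2 bits, which the new mapping then turns into weight_dtype = "int2". A worked example (the config values are illustrative):

# Worked example of the precision computation added above, using a
# typical AQLM "1x16" quantization_config.
quantization_config = {"nbits_per_codebook": 16, "num_codebooks": 1, "in_group_size": 8}
bits = int(quantization_config["nbits_per_codebook"]
           * quantization_config["num_codebooks"]
           / quantization_config["in_group_size"])
precision = f"{bits}bit"
print(precision)  # -> "2bit", so weight_dtype becomes "int2"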