lvkaokao committed · Commit b10d6d4 · Parent(s): 228e920

support aqlm and gptq 2/3 bits.

Changed files:
- src/display/utils.py +15 -2
- src/leaderboard/read_evals.py +2 -3
- src/submission/check_validity.py +11 -4
- src/submission/submit.py +15 -0
src/display/utils.py
CHANGED

@@ -204,6 +204,7 @@ class WeightType(Enum):
 
 class QuantType(Enum):
     gptq = ModelDetails(name="GPTQ", symbol="🟢")
+    aqlm = ModelDetails(name="AQLM", symbol="⭐")
     awq = ModelDetails(name="AWQ", symbol="🟩")
     llama_cpp = ModelDetails(name="llama.cpp", symbol="🔶")
     bnb = ModelDetails(name="bitsandbytes", symbol="💬")

@@ -216,6 +217,8 @@ class QuantType(Enum):
     def from_str(quant_dtype):
         if quant_dtype in ["GPTQ"]:
             return QuantType.gptq
+        if quant_dtype in ["AQLM"]:
+            return QuantType.aqlm
         if quant_dtype in ["AWQ"]:
             return QuantType.awq
         if quant_dtype in ["llama.cpp"]:

@@ -228,6 +231,8 @@ class QuantType(Enum):
 
 
 class WeightDtype(Enum):
+    int2 = ModelDetails("int2")
+    int3 = ModelDetails("int3")
     int4 = ModelDetails("int4")
     nf4 = ModelDetails("nf4")
     fp4 = ModelDetails("fp4")

@@ -235,6 +240,10 @@ class WeightDtype(Enum):
     Unknown = ModelDetails("?")
 
     def from_str(weight_dtype):
+        if weight_dtype in ["int2"]:
+            return WeightDtype.int2
+        if weight_dtype in ["int3"]:
+            return WeightDtype.int3
         if weight_dtype in ["int4"]:
             return WeightDtype.int4
         if weight_dtype in ["nf4"]:

@@ -290,6 +299,8 @@ class GroupDtype(Enum):
 class Precision(Enum):
     # float16 = ModelDetails("float16")
     # bfloat16 = ModelDetails("bfloat16")
+    qt_2bit = ModelDetails("2bit")
+    qt_3bit = ModelDetails("3bit")
     qt_4bit = ModelDetails("4bit")
     # qt_8bit = ModelDetails("8bit")
     # qt_GPTQ = ModelDetails("GPTQ")

@@ -300,8 +311,10 @@ class Precision(Enum):
         # return Precision.float16
         # if precision in ["torch.bfloat16", "bfloat16"]:
         # return Precision.bfloat16
-        if precision in ["
-            return Precision.
+        if precision in ["2bit"]:
+            return Precision.qt_2bit
+        if precision in ["3bit"]:
+            return Precision.qt_3bit
         if precision in ["4bit"]:
             return Precision.qt_4bit
         # if precision in ["GPTQ", "None"]:
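For reference, the string-to-enum dispatch these hunks extend can be exercised standalone. A minimal runnable sketch of the new AQLM path, with a stub ModelDetails standing in for the repo's real class (the stub and the staticmethod decorator are assumptions; member and method names come from the diff):

from dataclasses import dataclass
from enum import Enum

@dataclass
class ModelDetails:  # stub; the real class lives in src/display/utils.py
    name: str
    symbol: str = ""

class QuantType(Enum):
    gptq = ModelDetails(name="GPTQ", symbol="🟢")
    aqlm = ModelDetails(name="AQLM", symbol="⭐")

    @staticmethod
    def from_str(quant_dtype):
        if quant_dtype in ["GPTQ"]:
            return QuantType.gptq
        if quant_dtype in ["AQLM"]:
            return QuantType.aqlm
        return None

# After this commit, AQLM submissions resolve to their own member:
assert QuantType.from_str("AQLM") is QuantType.aqlm
assert QuantType.from_str("AQLM").value.symbol == "⭐"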
src/leaderboard/read_evals.py
CHANGED

@@ -54,8 +54,7 @@ class EvalResult:
         # Precision
         precision = Precision.from_str(config.get("precision", "4bit"))
         quant_type = QuantType.from_str(config.get("quant_type", "GPTQ"))
-
-        weight_dtype = WeightDtype.from_str(config.get("weight_dtype", "int4"))
+        weight_dtype = WeightDtype.from_str(data["task_info"].get("weight_dtype", "int4"))
         compute_dtype = ComputeDtype.from_str(data["task_info"].get("compute_dtype", "bfloat16"))
         double_quant = data["quantization_config"].get("bnb_4bit_use_double_quant", False)
         model_params = config["model_params"]

@@ -243,7 +242,7 @@ def get_raw_eval_results(results_path: str, requests_path: str, dynamic_path: str
         eval_result = EvalResult.init_from_json_file(model_result_filepath)
         eval_result.update_with_request_file(requests_path)
         if eval_result.full_model in dynamic_data:
-            eval_result.update_with_dynamic_file_dict(dynamic_data[eval_result.full_model])
+            # eval_result.update_with_dynamic_file_dict(dynamic_data[eval_result.full_model])
             # Hardcoding because of gating problem
             if "meta-llama" in eval_result.full_model:
                 eval_result.still_on_hub = True
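The first hunk moves the weight_dtype lookup from the request config to the result file's task_info block, next to where compute_dtype already lives. A hypothetical result-JSON fragment consistent with this parsing — only the key names are taken from the code; the exact nesting of config is an assumption:

# Hypothetical result-file layout implied by the parsing above.
data = {
    "config": {
        "precision": "2bit",
        "quant_type": "AQLM",
        "model_params": 7.0,
    },
    "task_info": {
        "weight_dtype": "int2",      # now read from here, not from config
        "compute_dtype": "bfloat16",
    },
    "quantization_config": {"bnb_4bit_use_double_quant": False},
}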
src/submission/check_validity.py
CHANGED

@@ -92,18 +92,22 @@ def get_model_size(model_info: ModelInfo, precision: str):
     return model_size
 
 KNOWN_SIZE_FACTOR = {
-    "gptq": {"4bit": 8, "8bit": 4},
+    "gptq": {"4bit": 8, "8bit": 4, "2bit": 8, "3bit": 12},
     "awq": {"4bit": 8},
-    "bitsandbytes": {"4bit": 2}
+    "bitsandbytes": {"4bit": 2},
+    "aqlm": {"4bit": 8, "8bit": 4, "2bit": 8, "3bit": 6},
 }
 
 BYTES = {
     "I32": 4,
+    "I16": 2,
+    "I8": 1,
     "F16": 2,
     "BF16": 2,
     "F32": 4,
     "U8": 1}
 
+
 def get_quantized_model_parameters_memory(model_info: ModelInfo, quant_method="", bits="4bit"):
     try:
         safetensors = get_safetensors_metadata(model_info.id)

@@ -111,9 +115,12 @@ def get_quantized_model_parameters_memory(model_info: ModelInfo, quant_method="", bits="4bit"):
         mem = 0
         for key in safetensors.parameter_count:
             mem += safetensors.parameter_count[key] * BYTES[key]
-
-            if key in ["I32", "U8"]:
-                num_parameters += safetensors.parameter_count[key] * KNOWN_SIZE_FACTOR[quant_method][bits]
+            if key in ["I32", "U8", "I16", "I8"]:
+                param = safetensors.parameter_count[key] * KNOWN_SIZE_FACTOR[quant_method][bits]
+                if key == "I8":
+                    param = param / 2
+                num_parameters += param
+
         params_b = round(num_parameters / 1e9, 2)
         size_gb = round(mem / 1e9,2)
         return params_b, size_gb
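The KNOWN_SIZE_FACTOR entries encode how many quantized weights are packed into one stored integer element: a 32-bit word holds 32 / 4 = 8 four-bit GPTQ values, so each I32 entry counts as 8 parameters. A worked example with a hypothetical tensor count (the 875M figure is illustrative, not from the source):

# Worked example: recovering the parameter count of a 4-bit GPTQ model
# whose packed weights are stored as I32 tensors in safetensors metadata.
packed_i32_entries = 875_000_000              # hypothetical count from metadata
factor = 8                                    # KNOWN_SIZE_FACTOR["gptq"]["4bit"]: one 32-bit word = 8 x 4-bit weights
num_parameters = packed_i32_entries * factor  # 7.0e9 real weights
size_gb = packed_i32_entries * 4 / 1e9        # BYTES["I32"] == 4 -> 3.5 GB on disk
print(round(num_parameters / 1e9, 2), round(size_gb, 2))  # 7.0 3.5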
src/submission/submit.py
CHANGED

@@ -140,6 +140,21 @@ def add_new_eval(
         hardware = "gpu"
         quant_type = "AWQ"
         precision = f"{quantization_config.get('bits', '4bit')}bit"
+    if quant_method == "aqlm":
+        hardware = "gpu"
+        quant_type = "AQLM"
+        nbits_per_codebook = quantization_config.get('nbits_per_codebook')
+        num_codebooks = quantization_config.get('num_codebooks')
+        in_group_size = quantization_config.get('in_group_size')
+        bits = int(nbits_per_codebook * num_codebooks / in_group_size)
+        precision = f"{bits}bit"
+
+    if precision == "4bit":
+        weight_dtype = "int4"
+    elif precision == "3bit":
+        weight_dtype = "int3"
+    elif precision == "2bit":
+        weight_dtype = "int2"
 
     if quant_type is None or quant_type == "":
         return styled_error("Please select a quantization model like GPTQ, AWQ etc.")
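The AQLM branch derives effective precision from the codebook geometry: bits per weight = nbits_per_codebook × num_codebooks / in_group_size. For the common AQLM "1x16" setup (one codebook, 16-bit codes, groups of 8 weights) that works out to 2 bits, which the new mapping then turns into weight_dtype = "int2". A worked example (the config values are illustrative):

# Worked example of the precision computation added above, using a
# typical AQLM "1x16" quantization_config.
quantization_config = {"nbits_per_codebook": 16, "num_codebooks": 1, "in_group_size": 8}
bits = int(quantization_config["nbits_per_codebook"]
           * quantization_config["num_codebooks"]
           / quantization_config["in_group_size"])
precision = f"{bits}bit"
print(precision)  # -> "2bit", so weight_dtype becomes "int2"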