Generation
Quantization requires llm-compressor with the changes from https://github.com/vllm-project/llm-compressor/pull/1788.
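Until that change is available in a release, one way to pick it up is to install llm-compressor directly from the pull request ref (a hedged sketch; assumes the PR head is still reachable on GitHub):

```bash
pip install git+https://github.com/vllm-project/llm-compressor.git@refs/pull/1788/head
```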
```python
from datasets import load_dataset
from transformers import AutoModelForCausalLM, AutoTokenizer

from llmcompressor import oneshot
from llmcompressor.modifiers.awq import AWQModifier

MODEL_ID = "Qwen/Qwen3-Coder-30B-A3B-Instruct"
SAVE_DIR = MODEL_ID.split("/")[-1] + "-W4A16-awq"

# Configure the quantization algorithm to run.
# lm_head, the MoE router gates, and any vision modules are excluded
# from quantization.
recipe = [
    AWQModifier(
        duo_scaling=False,
        ignore=[
            "lm_head",
            "re:.*mlp.gate$",
            "re:.*mlp.shared_expert_gate$",
            "re:visual.*",
        ],
        scheme="W4A16",
        targets=["Linear"],
    ),
]

# Select calibration dataset.
DATASET_ID = "codeparrot/self-instruct-starcoder"
DATASET_SPLIT = "curated"

# Select number of samples. 256 samples is a good place to start.
# Increasing the number of samples can improve accuracy.
NUM_CALIBRATION_SAMPLES = 256
MAX_SEQUENCE_LENGTH = 2048


def get_calib_dataset(tokenizer):
    # Load extra rows so NUM_CALIBRATION_SAMPLES remain after shuffling.
    ds = load_dataset(
        DATASET_ID,
        split=f"{DATASET_SPLIT}[:{NUM_CALIBRATION_SAMPLES*10}]",
    )

    def preprocess(example):
        # Render each instruction/output pair through the model's chat template.
        chat_messages = [
            {"role": "user", "content": example["instruction"].strip()},
            {"role": "assistant", "content": example["output"].strip()},
        ]
        tokenized_messages = tokenizer.apply_chat_template(
            chat_messages, tokenize=True
        )
        return {"input_ids": tokenized_messages}

    ds = (
        ds.shuffle(seed=42)
        .map(preprocess, remove_columns=ds.column_names)
        .select(range(NUM_CALIBRATION_SAMPLES))
    )
    return ds


if __name__ == "__main__":
    model = AutoModelForCausalLM.from_pretrained(
        MODEL_ID, torch_dtype="auto", trust_remote_code=True
    )
    tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)

    ###
    ### Apply algorithms.
    ###
    oneshot(
        model=model,
        dataset=get_calib_dataset(tokenizer),
        recipe=recipe,
        max_seq_length=MAX_SEQUENCE_LENGTH,
        num_calibration_samples=NUM_CALIBRATION_SAMPLES,
        log_dir=None,
        trust_remote_code_model=True,
    )

    # Save the quantized model and tokenizer.
    model.save_pretrained(SAVE_DIR)
    tokenizer.save_pretrained(SAVE_DIR)
```
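Once saved, the checkpoint can be loaded for inference with vLLM. A minimal sketch (the prompt and sampling settings below are illustrative, not part of the original recipe):

```python
from vllm import LLM, SamplingParams

# Load the compressed checkpoint produced above (SAVE_DIR).
llm = LLM(model="Qwen3-Coder-30B-A3B-Instruct-W4A16-awq")
params = SamplingParams(temperature=0.2, max_tokens=256)

outputs = llm.generate(["Write a Python function that reverses a string."], params)
print(outputs[0].outputs[0].text)
```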
Evaluation
The model was evaluated on the HumanEval and HumanEval+ benchmarks using the Neural Magic fork of the EvalPlus implementation and the vLLM engine, with the following commands:
```bash
python evalplus/codegen/generate.py \
    --model nm-testing/Qwen3-Coder-30B-A3B-Instruct-W4A16-awq \
    --bs 16 \
    --temperature 0.2 \
    --n_samples 50 \
    --root "./results" \
    --dataset humaneval \
    --backend vllm \
    --dtype auto

python evalplus/evalplus/sanitize.py \
    results/humaneval/nm-testing--Qwen3-Coder-30B-A3B-Instruct-W4A16-awq_vllm_temp_0.2

evalplus.evaluate \
    --dataset humaneval \
    --samples results/humaneval/nm-testing--Qwen3-Coder-30B-A3B-Instruct-W4A16-awq_vllm_temp_0.2-sanitized
```
| Metric | Qwen/Qwen3-Coder-30B-A3B-Instruct | nm-testing/Qwen3-Coder-30B-A3B-Instruct-W4A16-awq | 
|---|---|---|
| HumanEval pass@1 | 93.0 | 93.7 | 
| HumanEval pass@10 | 93.9 | 94.5 | 
| HumanEval+ pass@1 | 88.7 | 89.3 | 
| HumanEval+ pass@10 | 89.8 | 90.2 | 
| Average Score | 91.35 | 91.93 | 
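For context on the metrics above, pass@k is typically computed with the unbiased estimator from the original HumanEval paper, over the 50 samples generated per task (`--n_samples 50`). A minimal sketch of that calculation (function name and example counts are illustrative):

```python
from math import comb

def pass_at_k(n: int, c: int, k: int) -> float:
    """Unbiased pass@k: 1 - C(n - c, k) / C(n, k), where c of n samples pass."""
    if n - c < k:
        return 1.0  # fewer than k failures exist, so any k-draw contains a pass
    return 1.0 - comb(n - c, k) / comb(n, k)

# e.g. with 50 samples per task and 47 passing:
print(pass_at_k(50, 47, 1))   # 0.94
print(pass_at_k(50, 47, 10))  # 1.0
```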