Merge branch 'main' of hf.co:Qwen/Qwen3-0.6B-FP8

- README.md +23 -32
- added_tokens.json +0 -28
- special_tokens_map.json +0 -31

README.md CHANGED
@@ -1,4 +1,16 @@
+---
+library_name: transformers
+license: apache-2.0
+license_link: https://huggingface.co/Qwen/Qwen3-0.6B-FP8/blob/main/LICENSE
+pipeline_tag: text-generation
+base_model:
+- Qwen/Qwen3-0.6B
+---
+
 # Qwen3-0.6B-FP8
+<a href="https://chat.qwen.ai/" target="_blank" style="margin: 2px;">
+    <img alt="Chat" src="https://img.shields.io/badge/%F0%9F%92%9C%EF%B8%8F%20Qwen%20Chat%20-536af5" style="display: inline-block; vertical-align: middle;"/>
+</a>
 
 ## Qwen3 Highlights
 
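The YAML front matter added in this hunk is standard Hugging Face model-card metadata. As a minimal sketch of how it is consumed (assuming the `huggingface_hub` package, which is not part of this commit), it can be read programmatically:

```python
# Minimal sketch: reading the model-card metadata added above.
# Assumes `pip install huggingface_hub` and access to the Hub;
# shown for illustration only.
from huggingface_hub import ModelCard

card = ModelCard.load("Qwen/Qwen3-0.6B-FP8")
print(card.data.library_name)  # transformers
print(card.data.license)       # apache-2.0
print(card.data.pipeline_tag)  # text-generation
print(card.data.base_model)    # Qwen/Qwen3-0.6B
```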
@@ -83,53 +95,32 @@ print("thinking content:", thinking_content)
 print("content:", content)
 ```
 
-For deployment, you can use `…
-- …
+For deployment, you can use `sglang>=0.4.6.post1` or `vllm>=0.8.5` to create an OpenAI-compatible API endpoint:
+- SGLang:
     ```shell
-    …
+    python -m sglang.launch_server --model-path Qwen/Qwen3-0.6B-FP8 --reasoning-parser qwen3
     ```
-- …
+- vLLM:
     ```shell
-    …
+    vllm serve Qwen/Qwen3-0.6B-FP8 --enable-reasoning --reasoning-parser deepseek_r1
     ```
 
+For local use, applications such as Ollama, LMStudio, MLX-LM, llama.cpp, and KTransformers also support Qwen3.
+
 ## Note on FP8
 
 For convenience and performance, we provide an `fp8`-quantized model checkpoint for Qwen3, whose name ends with `-FP8`. The quantization method is fine-grained `fp8` quantization with a block size of 128. You can find more details in the `quantization_config` field in `config.json`.
 
-You can use the Qwen3-0.6B-FP8 model with several inference frameworks, including `transformers`, `…
+You can use the Qwen3-0.6B-FP8 model with several inference frameworks, including `transformers`, `sglang`, and `vllm`, just as you would the original bfloat16 model.
 However, please pay attention to the following known issues:
 - `transformers`:
     - there are currently issues with the "fine-grained fp8" method in `transformers` for distributed inference. You may need to set the environment variable `CUDA_LAUNCH_BLOCKING=1` if multiple devices are used in inference.
-- vLLM:
-    - there are currently compatibility issues with `vllm`. For a quick fix, you should make the following changes to `vllm/vllm/model_executor/layers/linear.py`:
-        ```python
-        # these changes are in QKVParallelLinear.weight_loader_v2() of vllm/vllm/model_executor/layers/linear.py
-        ...
-        shard_offset = self._get_shard_offset_mapping(loaded_shard_id)
-        shard_size = self._get_shard_size_mapping(loaded_shard_id)
-
-        # add the following code
-        if isinstance(param, BlockQuantScaleParameter):
-            weight_block_size = self.quant_method.quant_config.weight_block_size
-            block_n, _ = weight_block_size[0], weight_block_size[1]
-            shard_offset = (shard_offset + block_n - 1) // block_n
-            shard_size = (shard_size + block_n - 1) // block_n
-        # end of the modification
-
-        param.load_qkv_weight(loaded_weight=loaded_weight,
-                              num_heads=self.num_kv_head_replicas,
-                              shard_id=loaded_shard_id,
-                              shard_offset=shard_offset,
-                              shard_size=shard_size)
-        ...
-        ```
 
 ## Switching Between Thinking and Non-Thinking Mode
 
 > [!TIP]
-> The `enable_thinking` switch is also available in APIs created by …
-> Please refer to our documentation for […
+> The `enable_thinking` switch is also available in APIs created by SGLang and vLLM.
+> Please refer to our documentation for [SGLang](https://qwen.readthedocs.io/en/latest/deployment/sglang.html#thinking-non-thinking-modes) and [vLLM](https://qwen.readthedocs.io/en/latest/deployment/vllm.html#thinking-non-thinking-modes) users.
 
 ### `enable_thinking=True`
 
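Either launch command in the hunk above exposes an OpenAI-compatible endpoint. A minimal client sketch, assuming the `openai` Python package and vLLM's default port 8000 (SGLang defaults to 30000); the port, API key, and prompt are illustrative and not part of this commit:

```python
# Minimal sketch: querying the OpenAI-compatible endpoint started above.
# Assumes `pip install openai` and a vLLM server on localhost:8000;
# adjust base_url to match your deployment.
from openai import OpenAI

client = OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")
response = client.chat.completions.create(
    model="Qwen/Qwen3-0.6B-FP8",
    messages=[{"role": "user", "content": "Briefly introduce FP8 quantization."}],
)
print(response.choices[0].message.content)
```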
@@ -227,7 +218,7 @@ if __name__ == "__main__":
     print(f"Bot: {response_3}")
 ```
 
-> …
+> [!NOTE]
 > For API compatibility, when `enable_thinking=True`, regardless of whether the user uses `/think` or `/no_think`, the model will always output a block wrapped in `<think>...</think>`. However, the content inside this block may be empty if thinking is disabled.
 > When `enable_thinking=False`, the soft switches are not valid. Regardless of any `/think` or `/no_think` tags input by the user, the model will not generate think content and will not include a `<think>...</think>` block.
 
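The note added in this last hunk, together with the TIP in the previous one, concerns the `enable_thinking` switch. In `transformers` it is a keyword of `tokenizer.apply_chat_template`; a minimal sketch (the prompt text is illustrative):

```python
# Minimal sketch: toggling thinking mode via the chat template.
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen3-0.6B-FP8")
messages = [{"role": "user", "content": "How many r's are in 'strawberry'?"}]

# enable_thinking=False suppresses the <think>...</think> block;
# the default (True) primes the model to emit it.
text = tokenizer.apply_chat_template(
    messages,
    tokenize=False,
    add_generation_prompt=True,
    enable_thinking=False,
)
print(text)
```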
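As for the block-size-128 scheme described in the "Note on FP8" hunk above, the quantization parameters can be inspected without downloading the weights. A minimal sketch; the exact keys are whatever the repo's `config.json` ships, and the values in the comment are expectations rather than guarantees:

```python
# Minimal sketch: inspecting the quantization_config mentioned in
# "Note on FP8". config.json in the repo is authoritative.
from transformers import AutoConfig

config = AutoConfig.from_pretrained("Qwen/Qwen3-0.6B-FP8")
# Expect something like: {"quant_method": "fp8", "weight_block_size": [128, 128], ...}
print(config.quantization_config)
```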
    	
added_tokens.json DELETED

@@ -1,28 +0,0 @@
-{
-  "</think>": 151668,
-  "</tool_call>": 151658,
-  "</tool_response>": 151666,
-  "<think>": 151667,
-  "<tool_call>": 151657,
-  "<tool_response>": 151665,
-  "<|box_end|>": 151649,
-  "<|box_start|>": 151648,
-  "<|endoftext|>": 151643,
-  "<|file_sep|>": 151664,
-  "<|fim_middle|>": 151660,
-  "<|fim_pad|>": 151662,
-  "<|fim_prefix|>": 151659,
-  "<|fim_suffix|>": 151661,
-  "<|im_end|>": 151645,
-  "<|im_start|>": 151644,
-  "<|image_pad|>": 151655,
-  "<|object_ref_end|>": 151647,
-  "<|object_ref_start|>": 151646,
-  "<|quad_end|>": 151651,
-  "<|quad_start|>": 151650,
-  "<|repo_name|>": 151663,
-  "<|video_pad|>": 151656,
-  "<|vision_end|>": 151653,
-  "<|vision_pad|>": 151654,
-  "<|vision_start|>": 151652
-}
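Deleting `added_tokens.json` does not shrink the vocabulary: the same token-to-id mapping also lives in the tokenizer files that remain in the repo (an assumption about this repo's layout, consistent with how `transformers` resolves special tokens). A quick sanity check:

```python
# Minimal sketch: verifying the token-to-id mapping survives the
# removal of added_tokens.json; ids taken from the deleted file above.
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen3-0.6B-FP8")
assert tokenizer.convert_tokens_to_ids("<think>") == 151667
assert tokenizer.convert_tokens_to_ids("</think>") == 151668
assert tokenizer.convert_tokens_to_ids("<|im_end|>") == 151645
print("special-token ids intact")
```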
    	
special_tokens_map.json DELETED

@@ -1,31 +0,0 @@
-{
-  "additional_special_tokens": [
-    "<|im_start|>",
-    "<|im_end|>",
-    "<|object_ref_start|>",
-    "<|object_ref_end|>",
-    "<|box_start|>",
-    "<|box_end|>",
-    "<|quad_start|>",
-    "<|quad_end|>",
-    "<|vision_start|>",
-    "<|vision_end|>",
-    "<|vision_pad|>",
-    "<|image_pad|>",
-    "<|video_pad|>"
-  ],
-  "eos_token": {
-    "content": "<|im_end|>",
-    "lstrip": false,
-    "normalized": false,
-    "rstrip": false,
-    "single_word": false
-  },
-  "pad_token": {
-    "content": "<|endoftext|>",
-    "lstrip": false,
-    "normalized": false,
-    "rstrip": false,
-    "single_word": false
-  }
-}
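Likewise, the `eos_token` and `pad_token` definitions deleted here remain available on the loaded tokenizer, which picks them up from `tokenizer_config.json` (again an assumption about where the remaining copy lives):

```python
# Minimal sketch: the eos/pad tokens from the deleted
# special_tokens_map.json are still exposed by the tokenizer.
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen3-0.6B-FP8")
print(tokenizer.eos_token)  # expected: <|im_end|>
print(tokenizer.pad_token)  # expected: <|endoftext|>
```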