Upload folder using huggingface_hub

Files changed:
- README.md (+25 -13)
- chat_template.jinja (+25 -8)
- config.json (+4 -4)
- configuration_midashenglm.py (+4 -4)
- generation_config.json (+2 -2)
- modeling_midashenglm.py (+1 -1)
- processing_midashenglm.py (+46 -77)
README.md CHANGED
@@ -35,9 +35,7 @@ base_model_relation: finetune
 - Python >= 3.9
 - `transformers[torch]` >= 4.52
 - `torchaudio`
-
-> [!NOTE]
-> You may need to install additional dependencies, e.g. ffmpeg, to load audio files with `torchaudio`.
+- `librosa`
 
 ## Usage
 
@@ -51,20 +49,34 @@ base_model_relation: finetune
 >>> model.eval()
 >>> processor = AutoProcessor.from_pretrained("zhoukz/MiDashengLM-HF-dev", trust_remote_code=True)
 
->>>
-
-
-
-...
-...
-...
-...
-...
+>>> messages = [
+...     {
+...         "role": "system",
+...         "content": [
+...             {"type": "text", "text": "You are a helpful language and speech assistant."}
+...         ],
+...     },
+...     {
+...         "role": "user",
+...         "content": [
+...             {"type": "text", "text": "Caption the audio."},
+...             {
+...                 "type": "audio",
+...                 "path": "/path/to/audio.wav",
+...             },
+...         ],
+...     },
 ... ]
 
 >>> import torch
 >>> with torch.no_grad():
-...     model_inputs = processor(
+...     model_inputs = processor.apply_chat_template(
+...         messages,
+...         tokenize=True,
+...         add_generation_prompt=True,
+...         add_special_tokens=True,
+...         return_dict=True,
+...     )
 ...     generation = model.generate(**model_inputs)
 ...     output = processor.batch_decode(generation, skip_special_tokens=True)
 
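Put together, the updated README example corresponds to a script along these lines. This is a sketch: the `AutoModelForCausalLM.from_pretrained` call is inferred from the `auto_map` entry in config.json and is not part of the hunk shown above.

```python
import torch
from transformers import AutoModelForCausalLM, AutoProcessor

repo = "zhoukz/MiDashengLM-HF-dev"

# Assumption: the model is exposed through AutoModelForCausalLM, per config.json's
# auto_map ("AutoModelForCausalLM": "modeling_midashenglm.MiDashengLMModel").
model = AutoModelForCausalLM.from_pretrained(repo, trust_remote_code=True)
model.eval()
processor = AutoProcessor.from_pretrained(repo, trust_remote_code=True)

messages = [
    {
        "role": "system",
        "content": [{"type": "text", "text": "You are a helpful language and speech assistant."}],
    },
    {
        "role": "user",
        "content": [
            {"type": "text", "text": "Caption the audio."},
            {"type": "audio", "path": "/path/to/audio.wav"},  # replace with a real audio file
        ],
    },
]

with torch.no_grad():
    # Tokenize the chat and load the referenced audio in one step.
    model_inputs = processor.apply_chat_template(
        messages,
        tokenize=True,
        add_generation_prompt=True,
        add_special_tokens=True,
        return_dict=True,
    )
    generation = model.generate(**model_inputs)
    output = processor.batch_decode(generation, skip_special_tokens=True)

print(output)
```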
chat_template.jinja CHANGED
@@ -1,8 +1,25 @@
-{
-
-{
-{
-{
-{
-{
-{
+{%- for message in messages -%}
+    {%- if loop.first and message["role"] != "system" -%}
+        {{- "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n" -}}
+    {%- endif -%}
+    {{- "<|im_start|>" -}}
+    {{- message["role"] -}}
+    {{- "\n" -}}
+    {%- if message["content"] is string -%}
+        {{- message["content"] -}}
+    {%- else -%}
+        {%- for content in message["content"] -%}
+            {%- if content["type"] == "text" -%}
+                {{- content["text"] -}}
+            {%- elif content["type"] == "audio" -%}
+                {{- "<|audio_bos|><|AUDIO|><|audio_eos|>" -}}
+            {%- endif -%}
+        {%- endfor -%}
+    {%- endif -%}
+    {%- if not loop.last or loop.last and not continue_final_message -%}
+        {{- "<|im_end|>\n" -}}
+    {%- endif -%}
+{%- endfor -%}
+{%- if add_generation_prompt -%}
+    {{- "<|im_start|>assistant\n" -}}
+{%- endif -%}
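To preview what the new template produces, it can be rendered standalone with `jinja2`. This is a sketch: it assumes a local copy of chat_template.jinja and leaves `continue_final_message` unset, so every message is closed with `<|im_end|>`.

```python
from jinja2 import Template

# Assumption: chat_template.jinja from this commit is in the working directory.
with open("chat_template.jinja") as f:
    template = Template(f.read())

messages = [
    {"role": "system", "content": [{"type": "text", "text": "You are a helpful language and speech assistant."}]},
    {"role": "user", "content": [
        {"type": "text", "text": "Caption the audio."},
        {"type": "audio", "path": "/path/to/audio.wav"},
    ]},
]

prompt = template.render(messages=messages, add_generation_prompt=True)
print(prompt)
# Expected shape of the rendered prompt:
# <|im_start|>system
# You are a helpful language and speech assistant.<|im_end|>
# <|im_start|>user
# Caption the audio.<|audio_bos|><|AUDIO|><|audio_eos|><|im_end|>
# <|im_start|>assistant
```

Because the first message already has the `system` role, the template does not inject its default "You are a helpful assistant." preamble.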
config.json CHANGED
@@ -14,7 +14,7 @@
     "init_values": null,
     "input_channels": 1,
     "mlp_ratio": 4.0,
-    "model_type": "
+    "model_type": "midashenglm_dasheng_encoder",
     "n_fft": 512,
     "n_mels": 64,
     "num_heads": 16,
@@ -36,8 +36,8 @@
     "AutoConfig": "configuration_midashenglm.MiDashengLMConfig",
     "AutoModelForCausalLM": "modeling_midashenglm.MiDashengLMModel"
   },
-  "
-  "
+  "lora_target_modules": "all-linear",
+  "model_type": "midashenglm",
   "subsample_factor": 5,
   "text_config": {
     "attention_dropout": 0.0,
@@ -70,5 +70,5 @@
     "vocab_size": 152064
   },
   "torch_dtype": "float32",
-  "transformers_version": "4.52.
+  "transformers_version": "4.52.4"
 }
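The explicit `model_type` values ("midashenglm" for the top-level config, "midashenglm_dasheng_encoder" for the audio encoder) are what the auto classes key on when resolving the custom code referenced in `auto_map`. A quick check, as a sketch:

```python
from transformers import AutoConfig

# Resolves MiDashengLMConfig through the auto_map entry in config.json.
cfg = AutoConfig.from_pretrained("zhoukz/MiDashengLM-HF-dev", trust_remote_code=True)

print(type(cfg).__name__)    # expected: MiDashengLMConfig
print(cfg.model_type)        # expected: "midashenglm"
print(cfg.subsample_factor)  # expected: 5
```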
configuration_midashenglm.py CHANGED
@@ -1,5 +1,5 @@
 from ast import Dict
-from typing import Tuple, Union
+from typing import Optional, Tuple, Union
 
 from transformers import PretrainedConfig
 from transformers.models.qwen2_5_omni.configuration_qwen2_5_omni import (
@@ -8,7 +8,7 @@ from transformers.models.qwen2_5_omni.configuration_qwen2_5_omni import (
 
 
 class DashengConfig(PretrainedConfig):
-    model_type = "
+    model_type = "midashenglm_dasheng_encoder"
 
     def __init__(
         self,
@@ -22,7 +22,7 @@ class DashengConfig(PretrainedConfig):
         num_heads: int = 12,
         mlp_ratio: float = 4.0,
         qkv_bias: bool = True,
-        init_values: float
+        init_values: Optional[float] = None,
         drop_rate: float = 0.0,
         attn_drop_rate: float = 0.0,
         f_min: float = 0.0,
@@ -60,7 +60,7 @@ class DashengConfig(PretrainedConfig):
 
 
 class MiDashengLMConfig(PretrainedConfig):
-    model_type = "
+    model_type = "midashenglm"
 
     def __init__(
         self,
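With `init_values` relaxed to `Optional[float] = None`, `DashengConfig` can be constructed without arguments, consistent with `"init_values": null` in config.json. A minimal sketch, assuming the module is importable from a local checkout and that `__init__` stores the value as an attribute:

```python
# Assumption: configuration_midashenglm.py is importable from a local checkout of the repo.
from configuration_midashenglm import DashengConfig

enc_cfg = DashengConfig()   # every parameter now has a default
print(DashengConfig.model_type)  # "midashenglm_dasheng_encoder"
print(enc_cfg.init_values)       # None (assumed to be stored as-is by __init__)
```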
generation_config.json CHANGED
@@ -3,7 +3,7 @@
     151643,
     151645
   ],
-  "max_length":
+  "max_length": 32768,
   "pad_token_id": 151643,
-  "transformers_version": "4.52.
+  "transformers_version": "4.52.4"
 }
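For reference, the updated generation defaults can be inspected directly; a minimal sketch:

```python
from transformers import GenerationConfig

gen_cfg = GenerationConfig.from_pretrained("zhoukz/MiDashengLM-HF-dev")

print(gen_cfg.max_length)    # expected: 32768
print(gen_cfg.eos_token_id)  # expected: [151643, 151645]
print(gen_cfg.pad_token_id)  # expected: 151643
```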
modeling_midashenglm.py CHANGED
@@ -172,7 +172,7 @@ class DashengBlock(nn.Module):
         qkv_bias: bool = False,
         drop: float = 0.0,
         attn_drop: float = 0.0,
-        init_values: float
+        init_values: Optional[float] = None,
         act_layer: Type[nn.Module] = nn.GELU,
         norm_layer: Type[nn.Module] = nn.LayerNorm,
         attention_type: Type[nn.Module] = DashengAttention,
processing_midashenglm.py CHANGED
@@ -1,6 +1,4 @@
-from
-
-from typing import List
+from typing import List, Optional, Union
 
 import numpy as np
 import torch
@@ -51,13 +49,13 @@ class MiAudioLLMProcessor(ProcessorMixin):
 
     def __init__(
         self,
-        feature_extractor: Wav2Vec2FeatureExtractor
-        tokenizer: Qwen2Tokenizer
+        feature_extractor: Optional[Wav2Vec2FeatureExtractor] = None,
+        tokenizer: Optional[Union[Qwen2Tokenizer, Qwen2TokenizerFast]] = None,
         model_subsampling: int = 5,
-        chat_template: str
-        audio_token: str
-        audio_bos_token: str
-        audio_eos_token: str
+        chat_template: Optional[str] = None,
+        audio_token: Optional[str] = None,
+        audio_bos_token: Optional[str] = None,
+        audio_eos_token: Optional[str] = None,
     ):
         assert tokenizer is not None, "Tokenizer Needs to be passed"
         assert audio_token is not None or hasattr(tokenizer, "audio_token"), (
@@ -71,7 +69,7 @@ class MiAudioLLMProcessor(ProcessorMixin):
         )
 
         if chat_template is None:
-            chat_template =
+            chat_template = tokenizer.chat_template
 
         self.audio_token: str = audio_token or tokenizer.audio_token
         self.audio_bos_token = audio_bos_token or tokenizer.audio_bos_token
@@ -86,10 +84,39 @@ class MiAudioLLMProcessor(ProcessorMixin):
 
         super().__init__(feature_extractor, tokenizer, chat_template=chat_template)
 
+    @classmethod
+    def _validate_audio_sample(
+        cls,
+        sample: Union[np.ndarray, torch.Tensor],
+    ) -> np.ndarray:
+        if isinstance(sample, torch.Tensor):
+            if sample.ndim != 1:
+                raise ValueError("Audio tensor must be 1D.")
+            return sample.numpy()
+        if isinstance(sample, np.ndarray):
+            if sample.ndim != 1:
+                raise ValueError("Audio array must be 1D.")
+            return sample
+        if isinstance(sample, str):
+            # When passing audio paths through `apply_chat_template`, transformers
+            # will attempt to load the audio file, but only succeeds if the path
+            # is a valid URL (starting with http:// or https://) or an existing local
+            # file. Otherwise, the string is passed as-is. This captures that case and
+            # raises an error to inform the user.
+            raise TypeError(
                "Expected audio to be a numpy array or torch tensor, but got a string. "
+                "If you passed audios through `apply_chat_template`, "
+                "make sure the audio paths are valid URLs starting with http:// or https://, "
+                "or existing local files."
+            )
+        raise TypeError(
+            f"Expected audio to be a numpy array, torch tensor, or string, but got {type(sample)}."
+        )
+
     def __call__(
         self,
-        text: List[str]
-        audio: List[np.ndarray]
+        text: Optional[List[str]] = None,
+        audio: Optional[Union[List[np.ndarray], List[torch.Tensor]]] = None,
         **kwargs: Unpack[MiAudioLLMProcessorKwargs],
     ) -> BatchFeature:
         if text is None:
@@ -101,6 +128,12 @@ class MiAudioLLMProcessor(ProcessorMixin):
                 "Invalid input text. Please provide a string, or a list of strings"
             )
 
+        if (
+            kwargs.get("images", None) is not None
+            or kwargs.get("videos", None) is not None
+        ):
+            raise ValueError("This model does not support images or videos.")
+
         output_kwargs = self._merge_kwargs(
             MiAudioLLMProcessorKwargs,
             tokenizer_init_kwargs=self.tokenizer.init_kwargs,
@@ -108,16 +141,7 @@ class MiAudioLLMProcessor(ProcessorMixin):
         )
 
         if audio is not None:
-
-            audio = [sample_.numpy() for sample_ in audio]
-
-            if isinstance(audio[0], torch.Tensor):
-                audio = [sample_.squeeze(0) for sample_ in audio]
-                if not all(x_.ndim == 1 for x_ in audio):
-                    raise ValueError("All samples in a list must be 1D.")
-            if isinstance(audio[0], np.ndarray):
-                if not all(x_.ndim == 1 for x_ in audio):
-                    raise ValueError("All samples in a list must be 1D.")
+            audio = [self._validate_audio_sample(sample) for sample in audio]
             # ensure we have as much audios as audio tokens
             num_audio_tokens = sum(sample.count(self.audio_token) for sample in text)
             num_audios = 1 if type(audio) is np.ndarray else len(audio)
@@ -223,58 +247,3 @@ class MiAudioLLMProcessor(ProcessorMixin):
             tokenizer_input_names + feature_extractor_input_names + ["audio_length"]
         )
     )
-
-    @property
-    # NOTE: we don't have default templates anymore, and the below is kept only because the hub config is not yet updated!
-    def default_chat_template(self):
-        """
-        This default vicuna template formats inputs in the form of a chat history. For each message in the chat history:
-        * the template will output the role of the speaker followed by the content of the message.
-        * content is a list of strings and audios.
-        * If the content element is an audio, the template will output a sequence of <|AUDIO|> tokens
-
-        Example:
-
-        ```python
-        messages = [
-            {'role': 'system', 'content': 'You are a helpful assistant.'},
-            {"role": "user", "content": [
-                {"type": "audio", "audio_url": "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen2-Audio/audio/glass-breaking-151256.mp3"},
-                {"type": "text", "text": "What's that sound?"},
-            ]},
-            {"role": "assistant", "content": "It is the sound of glass shattering."},
-            {"role": "user", "content": [
-                {"type": "audio", "audio_url": "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen2-Audio/audio/f2641_0_throatclearing.wav"},
-                {"type": "text", "text": "How about this one?"},
-            ]},
-        ]
-
-        result = template.render(messages=messages, add_generation_prompt=True)
-        ```
-        """
-        # fmt: off
-        return (
-            "{% set audio_count = namespace(value=0) %}"
-            "{% for message in messages %}"
-                "{% if loop.first and message['role'] != 'system' %}"
-                    "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n"
-                "{% endif %}"
-                "<|im_start|>{{ message['role'] }}\n"
-                "{% if message['content'] is string %}"
-                    "{{ message['content'] }}<|im_end|>\n"
-                "{% else %}"
-                    "{% for content in message['content'] %}"
-                        "{% if 'audio' in content or 'audio_url' in content or message['type'] == 'audio' %}"
-                            "{% set audio_count.value = audio_count.value + 1 %}"
-                            "Audio {{ audio_count.value }}: <|audio_bos|><|AUDIO|><|audio_eos|>\n"
-                        "{% elif 'text' in content %}"
-                            "{{ content['text'] }}"
-                        "{% endif %}"
-                    "{% endfor %}"
-                    "<|im_end|>\n"
-                "{% endif %}"
-            "{% endfor %}"
-            "{% if add_generation_prompt %}"
-                "<|im_start|>assistant\n"
-            "{% endif %}"
-        )
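The inline audio checks in `__call__` are now centralised in `_validate_audio_sample`: 1-D `torch.Tensor` inputs are converted to NumPy, 1-D `np.ndarray` inputs pass through, and strings (typically audio paths that `apply_chat_template` could not load) raise a `TypeError`. The following sketch shows that contract when calling the processor directly; the prompt string and the 16 kHz dummy waveform are assumptions for illustration only.

```python
import numpy as np
import torch
from transformers import AutoProcessor

processor = AutoProcessor.from_pretrained("zhoukz/MiDashengLM-HF-dev", trust_remote_code=True)

# One 1-D waveform per <|AUDIO|> placeholder in the text.
# Assumption: this prompt mirrors what the chat template renders for one audio clip.
text = [
    "<|im_start|>user\nCaption the audio.<|audio_bos|><|AUDIO|><|audio_eos|><|im_end|>\n"
    "<|im_start|>assistant\n"
]
waveform = np.zeros(16000, dtype=np.float32)  # one second of silence at an assumed 16 kHz

inputs = processor(text=text, audio=[waveform])             # ok: 1-D numpy array
inputs = processor(text=text, audio=[torch.zeros(16000)])   # ok: 1-D tensor, converted to numpy

try:
    processor(text=text, audio=["/path/that/does/not/exist.wav"])  # strings are rejected
except TypeError as err:
    print(err)
```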