Upload folder using huggingface_hub

Files changed:
- README.md (+25 -13)
- chat_template.jinja (+25 -8)
- config.json (+4 -4)
- configuration_midashenglm.py (+4 -4)
- generation_config.json (+2 -2)
- modeling_midashenglm.py (+1 -1)
- processing_midashenglm.py (+46 -77)
README.md CHANGED
@@ -35,9 +35,7 @@ base_model_relation: finetune
 - Python >= 3.9
 - `transformers[torch]` >= 4.52
 - `torchaudio`
-
-> [!NOTE]
-> You may need to install additional dependencies, e.g. ffmpeg, to load audio files with `torchaudio`.
+- `librosa`
 
 ## Usage
 
@@ -51,20 +49,34 @@ base_model_relation: finetune
 >>> model.eval()
 >>> processor = AutoProcessor.from_pretrained("zhoukz/MiDashengLM-HF-dev", trust_remote_code=True)
 
->>>
-
-
-
-...
-...
-...
-...
-...
+>>> messages = [
+...     {
+...         "role": "system",
+...         "content": [
+...             {"type": "text", "text": "You are a helpful language and speech assistant."}
+...         ],
+...     },
+...     {
+...         "role": "user",
+...         "content": [
+...             {"type": "text", "text": "Caption the audio."},
+...             {
+...                 "type": "audio",
+...                 "path": "/path/to/audio.wav",
+...             },
+...         ],
+...     },
 ... ]
 
 >>> import torch
 >>> with torch.no_grad():
-...     model_inputs = processor(
+...     model_inputs = processor.apply_chat_template(
+...         messages,
+...         tokenize=True,
+...         add_generation_prompt=True,
+...         add_special_tokens=True,
+...         return_dict=True,
+...     )
 ...     generation = model.generate(**model_inputs)
 ...     output = processor.batch_decode(generation, skip_special_tokens=True)
 
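Put together, the updated README example corresponds to a script along these lines. This is a sketch: the `AutoModelForCausalLM.from_pretrained` call is inferred from the `auto_map` entry in config.json and is not part of the hunk shown above.

```python
import torch
from transformers import AutoModelForCausalLM, AutoProcessor

repo = "zhoukz/MiDashengLM-HF-dev"

# Assumption: the model is exposed through AutoModelForCausalLM, per config.json's
# auto_map ("AutoModelForCausalLM": "modeling_midashenglm.MiDashengLMModel").
model = AutoModelForCausalLM.from_pretrained(repo, trust_remote_code=True)
model.eval()
processor = AutoProcessor.from_pretrained(repo, trust_remote_code=True)

messages = [
    {
        "role": "system",
        "content": [{"type": "text", "text": "You are a helpful language and speech assistant."}],
    },
    {
        "role": "user",
        "content": [
            {"type": "text", "text": "Caption the audio."},
            {"type": "audio", "path": "/path/to/audio.wav"},  # replace with a real audio file
        ],
    },
]

with torch.no_grad():
    # Tokenize the chat and load the referenced audio in one step.
    model_inputs = processor.apply_chat_template(
        messages,
        tokenize=True,
        add_generation_prompt=True,
        add_special_tokens=True,
        return_dict=True,
    )
    generation = model.generate(**model_inputs)
    output = processor.batch_decode(generation, skip_special_tokens=True)

print(output)
```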
chat_template.jinja CHANGED
@@ -1,8 +1,25 @@
-{
-
-{
-{
-{
-{
-{
-{
+{%- for message in messages -%}
+    {%- if loop.first and message["role"] != "system" -%}
+        {{- "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n" -}}
+    {%- endif -%}
+    {{- "<|im_start|>" -}}
+    {{- message["role"] -}}
+    {{- "\n" -}}
+    {%- if message["content"] is string -%}
+        {{- message["content"] -}}
+    {%- else -%}
+        {%- for content in message["content"] -%}
+            {%- if content["type"] == "text" -%}
+                {{- content["text"] -}}
+            {%- elif content["type"] == "audio" -%}
+                {{- "<|audio_bos|><|AUDIO|><|audio_eos|>" -}}
+            {%- endif -%}
+        {%- endfor -%}
+    {%- endif -%}
+    {%- if not loop.last or loop.last and not continue_final_message -%}
+        {{- "<|im_end|>\n" -}}
+    {%- endif -%}
+{%- endfor -%}
+{%- if add_generation_prompt -%}
+    {{- "<|im_start|>assistant\n" -}}
+{%- endif -%}
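To preview what the new template produces, it can be rendered standalone with `jinja2`. This is a sketch: it assumes a local copy of chat_template.jinja and leaves `continue_final_message` unset, so every message is closed with `<|im_end|>`.

```python
from jinja2 import Template

# Assumption: chat_template.jinja from this commit is in the working directory.
with open("chat_template.jinja") as f:
    template = Template(f.read())

messages = [
    {"role": "system", "content": [{"type": "text", "text": "You are a helpful language and speech assistant."}]},
    {"role": "user", "content": [
        {"type": "text", "text": "Caption the audio."},
        {"type": "audio", "path": "/path/to/audio.wav"},
    ]},
]

prompt = template.render(messages=messages, add_generation_prompt=True)
print(prompt)
# Expected shape of the rendered prompt:
# <|im_start|>system
# You are a helpful language and speech assistant.<|im_end|>
# <|im_start|>user
# Caption the audio.<|audio_bos|><|AUDIO|><|audio_eos|><|im_end|>
# <|im_start|>assistant
```

Because the first message already has the `system` role, the template does not inject its default "You are a helpful assistant." preamble.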
config.json CHANGED
@@ -14,7 +14,7 @@
     "init_values": null,
     "input_channels": 1,
     "mlp_ratio": 4.0,
-    "model_type": "
+    "model_type": "midashenglm_dasheng_encoder",
     "n_fft": 512,
     "n_mels": 64,
     "num_heads": 16,
@@ -36,8 +36,8 @@
     "AutoConfig": "configuration_midashenglm.MiDashengLMConfig",
     "AutoModelForCausalLM": "modeling_midashenglm.MiDashengLMModel"
   },
-  "
-  "
+  "lora_target_modules": "all-linear",
+  "model_type": "midashenglm",
   "subsample_factor": 5,
   "text_config": {
     "attention_dropout": 0.0,
@@ -70,5 +70,5 @@
     "vocab_size": 152064
   },
   "torch_dtype": "float32",
-  "transformers_version": "4.52.
+  "transformers_version": "4.52.4"
 }
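The explicit `model_type` values ("midashenglm" for the top-level config, "midashenglm_dasheng_encoder" for the audio encoder) are what the auto classes key on when resolving the custom code referenced in `auto_map`. A quick check, as a sketch:

```python
from transformers import AutoConfig

# Resolves MiDashengLMConfig through the auto_map entry in config.json.
cfg = AutoConfig.from_pretrained("zhoukz/MiDashengLM-HF-dev", trust_remote_code=True)

print(type(cfg).__name__)    # expected: MiDashengLMConfig
print(cfg.model_type)        # expected: "midashenglm"
print(cfg.subsample_factor)  # expected: 5
```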
configuration_midashenglm.py CHANGED
@@ -1,5 +1,5 @@
 from ast import Dict
-from typing import Tuple, Union
+from typing import Optional, Tuple, Union
 
 from transformers import PretrainedConfig
 from transformers.models.qwen2_5_omni.configuration_qwen2_5_omni import (
@@ -8,7 +8,7 @@ from transformers.models.qwen2_5_omni.configuration_qwen2_5_omni import (
 
 
 class DashengConfig(PretrainedConfig):
-    model_type = "
+    model_type = "midashenglm_dasheng_encoder"
 
     def __init__(
         self,
@@ -22,7 +22,7 @@ class DashengConfig(PretrainedConfig):
         num_heads: int = 12,
         mlp_ratio: float = 4.0,
         qkv_bias: bool = True,
-        init_values: float
+        init_values: Optional[float] = None,
         drop_rate: float = 0.0,
         attn_drop_rate: float = 0.0,
         f_min: float = 0.0,
@@ -60,7 +60,7 @@ class DashengConfig(PretrainedConfig):
 
 
 class MiDashengLMConfig(PretrainedConfig):
-    model_type = "
+    model_type = "midashenglm"
 
     def __init__(
         self,
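With `init_values` relaxed to `Optional[float] = None`, `DashengConfig` can be constructed without arguments, consistent with `"init_values": null` in config.json. A minimal sketch, assuming the module is importable from a local checkout and that `__init__` stores the value as an attribute:

```python
# Assumption: configuration_midashenglm.py is importable from a local checkout of the repo.
from configuration_midashenglm import DashengConfig

enc_cfg = DashengConfig()   # every parameter now has a default
print(DashengConfig.model_type)  # "midashenglm_dasheng_encoder"
print(enc_cfg.init_values)       # None (assumed to be stored as-is by __init__)
```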
generation_config.json CHANGED
@@ -3,7 +3,7 @@
     151643,
     151645
   ],
-  "max_length":
+  "max_length": 32768,
   "pad_token_id": 151643,
-  "transformers_version": "4.52.
+  "transformers_version": "4.52.4"
 }
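For reference, the updated generation defaults can be inspected directly; a minimal sketch:

```python
from transformers import GenerationConfig

gen_cfg = GenerationConfig.from_pretrained("zhoukz/MiDashengLM-HF-dev")

print(gen_cfg.max_length)    # expected: 32768
print(gen_cfg.eos_token_id)  # expected: [151643, 151645]
print(gen_cfg.pad_token_id)  # expected: 151643
```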
modeling_midashenglm.py CHANGED
@@ -172,7 +172,7 @@ class DashengBlock(nn.Module):
         qkv_bias: bool = False,
         drop: float = 0.0,
         attn_drop: float = 0.0,
-        init_values: float
+        init_values: Optional[float] = None,
         act_layer: Type[nn.Module] = nn.GELU,
         norm_layer: Type[nn.Module] = nn.LayerNorm,
         attention_type: Type[nn.Module] = DashengAttention,
processing_midashenglm.py CHANGED
@@ -1,6 +1,4 @@
-from
-
-from typing import List
+from typing import List, Optional, Union
 
 import numpy as np
 import torch
@@ -51,13 +49,13 @@ class MiAudioLLMProcessor(ProcessorMixin):
 
     def __init__(
         self,
-        feature_extractor: Wav2Vec2FeatureExtractor
-        tokenizer: Qwen2Tokenizer
+        feature_extractor: Optional[Wav2Vec2FeatureExtractor] = None,
+        tokenizer: Optional[Union[Qwen2Tokenizer, Qwen2TokenizerFast]] = None,
         model_subsampling: int = 5,
-        chat_template: str
-        audio_token: str
-        audio_bos_token: str
-        audio_eos_token: str
+        chat_template: Optional[str] = None,
+        audio_token: Optional[str] = None,
+        audio_bos_token: Optional[str] = None,
+        audio_eos_token: Optional[str] = None,
     ):
         assert tokenizer is not None, "Tokenizer Needs to be passed"
         assert audio_token is not None or hasattr(tokenizer, "audio_token"), (
@@ -71,7 +69,7 @@ class MiAudioLLMProcessor(ProcessorMixin):
         )
 
         if chat_template is None:
-            chat_template =
+            chat_template = tokenizer.chat_template
 
         self.audio_token: str = audio_token or tokenizer.audio_token
         self.audio_bos_token = audio_bos_token or tokenizer.audio_bos_token
@@ -86,10 +84,39 @@ class MiAudioLLMProcessor(ProcessorMixin):
 
         super().__init__(feature_extractor, tokenizer, chat_template=chat_template)
 
+    @classmethod
+    def _validate_audio_sample(
+        cls,
+        sample: Union[np.ndarray, torch.Tensor],
+    ) -> np.ndarray:
+        if isinstance(sample, torch.Tensor):
+            if sample.ndim != 1:
+                raise ValueError("Audio tensor must be 1D.")
+            return sample.numpy()
+        if isinstance(sample, np.ndarray):
+            if sample.ndim != 1:
+                raise ValueError("Audio array must be 1D.")
+            return sample
+        if isinstance(sample, str):
+            # When passing audio paths through `apply_chat_template`, transformers
+            # will attempt to load the audio file, but only succeeds if the path
+            # is a valid URL (starting with http:// or https://) or an existing local
+            # file. Otherwise, the string is passed as-is. This captures that case and
+            # raises an error to inform the user.
+            raise TypeError(
                "Expected audio to be a numpy array or torch tensor, but got a string. "
+                "If you passed audios through `apply_chat_template`, "
+                "make sure the audio paths are valid URLs starting with http:// or https://, "
+                "or existing local files."
+            )
+        raise TypeError(
+            f"Expected audio to be a numpy array, torch tensor, or string, but got {type(sample)}."
+        )
+
     def __call__(
         self,
-        text: List[str]
-        audio: List[np.ndarray]
+        text: Optional[List[str]] = None,
+        audio: Optional[Union[List[np.ndarray], List[torch.Tensor]]] = None,
         **kwargs: Unpack[MiAudioLLMProcessorKwargs],
     ) -> BatchFeature:
         if text is None:
@@ -101,6 +128,12 @@ class MiAudioLLMProcessor(ProcessorMixin):
                 "Invalid input text. Please provide a string, or a list of strings"
             )
 
+        if (
+            kwargs.get("images", None) is not None
+            or kwargs.get("videos", None) is not None
+        ):
+            raise ValueError("This model does not support images or videos.")
+
         output_kwargs = self._merge_kwargs(
             MiAudioLLMProcessorKwargs,
             tokenizer_init_kwargs=self.tokenizer.init_kwargs,
@@ -108,16 +141,7 @@ class MiAudioLLMProcessor(ProcessorMixin):
         )
 
         if audio is not None:
-
-            audio = [sample_.numpy() for sample_ in audio]
-
-            if isinstance(audio[0], torch.Tensor):
-                audio = [sample_.squeeze(0) for sample_ in audio]
-                if not all(x_.ndim == 1 for x_ in audio):
-                    raise ValueError("All samples in a list must be 1D.")
-            if isinstance(audio[0], np.ndarray):
-                if not all(x_.ndim == 1 for x_ in audio):
-                    raise ValueError("All samples in a list must be 1D.")
+            audio = [self._validate_audio_sample(sample) for sample in audio]
             # ensure we have as much audios as audio tokens
             num_audio_tokens = sum(sample.count(self.audio_token) for sample in text)
             num_audios = 1 if type(audio) is np.ndarray else len(audio)
@@ -223,58 +247,3 @@ class MiAudioLLMProcessor(ProcessorMixin):
             tokenizer_input_names + feature_extractor_input_names + ["audio_length"]
         )
     )
-
-    @property
-    # NOTE: we don't have default templates anymore, and the below is kept only because the hub config is not yet updated!
-    def default_chat_template(self):
-        """
-        This default vicuna template formats inputs in the form of a chat history. For each message in the chat history:
-        * the template will output the role of the speaker followed by the content of the message.
-        * content is a list of strings and audios.
-        * If the content element is an audio, the template will output a sequence of <|AUDIO|> tokens
-
-        Example:
-
-        ```python
-        messages = [
-            {'role': 'system', 'content': 'You are a helpful assistant.'},
-            {"role": "user", "content": [
-                {"type": "audio", "audio_url": "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen2-Audio/audio/glass-breaking-151256.mp3"},
-                {"type": "text", "text": "What's that sound?"},
-            ]},
-            {"role": "assistant", "content": "It is the sound of glass shattering."},
-            {"role": "user", "content": [
-                {"type": "audio", "audio_url": "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen2-Audio/audio/f2641_0_throatclearing.wav"},
-                {"type": "text", "text": "How about this one?"},
-            ]},
-        ]
-
-        result = template.render(messages=messages, add_generation_prompt=True)
-        ```
-        """
-        # fmt: off
-        return (
-            "{% set audio_count = namespace(value=0) %}"
-            "{% for message in messages %}"
-                "{% if loop.first and message['role'] != 'system' %}"
-                    "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n"
-                "{% endif %}"
-                "<|im_start|>{{ message['role'] }}\n"
-                "{% if message['content'] is string %}"
-                    "{{ message['content'] }}<|im_end|>\n"
-                "{% else %}"
-                    "{% for content in message['content'] %}"
-                        "{% if 'audio' in content or 'audio_url' in content or message['type'] == 'audio' %}"
-                            "{% set audio_count.value = audio_count.value + 1 %}"
-                            "Audio {{ audio_count.value }}: <|audio_bos|><|AUDIO|><|audio_eos|>\n"
-                        "{% elif 'text' in content %}"
-                            "{{ content['text'] }}"
-                        "{% endif %}"
-                    "{% endfor %}"
-                    "<|im_end|>\n"
-                "{% endif %}"
-            "{% endfor %}"
-            "{% if add_generation_prompt %}"
-                "<|im_start|>assistant\n"
-            "{% endif %}"
-        )
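The inline audio checks in `__call__` are now centralised in `_validate_audio_sample`: 1-D `torch.Tensor` inputs are converted to NumPy, 1-D `np.ndarray` inputs pass through, and strings (typically audio paths that `apply_chat_template` could not load) raise a `TypeError`. The following sketch shows that contract when calling the processor directly; the prompt string and the 16 kHz dummy waveform are assumptions for illustration only.

```python
import numpy as np
import torch
from transformers import AutoProcessor

processor = AutoProcessor.from_pretrained("zhoukz/MiDashengLM-HF-dev", trust_remote_code=True)

# One 1-D waveform per <|AUDIO|> placeholder in the text.
# Assumption: this prompt mirrors what the chat template renders for one audio clip.
text = [
    "<|im_start|>user\nCaption the audio.<|audio_bos|><|AUDIO|><|audio_eos|><|im_end|>\n"
    "<|im_start|>assistant\n"
]
waveform = np.zeros(16000, dtype=np.float32)  # one second of silence at an assumed 16 kHz

inputs = processor(text=text, audio=[waveform])             # ok: 1-D numpy array
inputs = processor(text=text, audio=[torch.zeros(16000)])   # ok: 1-D tensor, converted to numpy

try:
    processor(text=text, audio=["/path/that/does/not/exist.wav"])  # strings are rejected
except TypeError as err:
    print(err)
```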