add

Files changed (4) hide show

config.json +0 -120
examples/m1.wav +0 -0
preprocessor_config.json +0 -13
pytorch_model.bin +0 -3

config.json DELETED Viewed

@@ -1,120 +0,0 @@
-{
-  "model_type": "xy_tokenizer",
-  "input_sample_rate": 16000,
-  "output_sample_rate": 24000,
-  "encoder_downsample_rate": 1280,
-  "decoder_upsample_rate": 1920,
-  "code_dim": 3072,
-  "params": {
-    "feature_extractor_kwargs": {
-      "chunk_length": 30,
-      "feature_size": 80,
-      "hop_length": 160,
-      "n_fft": 400,
-      "n_samples": 480000,
-      "nb_max_frames": 3000,
-      "padding_side": "right",
-      "padding_value": 0.0,
-      "sampling_rate": 16000,
-      "return_attention_mask": true,
-      "return_tensors": "pt"
-    },
-    "semantic_encoder_kwargs": {
-      "num_mel_bins": 80,
-      "sampling_rate": 16000,
-      "hop_length": 160,
-      "stride_size": 2,
-      "kernel_size": 3,
-      "d_model": 768,
-      "scale_embedding": false,
-      "max_audio_seconds": 30,
-      "encoder_layers": 12,
-      "encoder_attention_heads": 12,
-      "encoder_ffn_dim": 3072,
-      "activation_function": "gelu"
-    },
-    "semantic_encoder_adapter_kwargs": {
-      "input_dim": 768,
-      "output_dim": 768,
-      "d_model": 768,
-      "max_source_positions": 1500,
-      "encoder_layers": 4,
-      "encoder_attention_heads": 12,
-      "encoder_ffn_dim": 3072
-    },
-    "acoustic_encoder_kwargs": {
-      "num_mel_bins": 80,
-      "sampling_rate": 16000,
-      "hop_length": 160,
-      "stride_size": 2,
-      "kernel_size": 3,
-      "d_model": 768,
-      "scale_embedding": false,
-      "max_audio_seconds": 30,
-      "encoder_layers": 12,
-      "encoder_attention_heads": 12,
-      "encoder_ffn_dim": 3072,
-      "activation_function": "gelu"
-    },
-    "pre_rvq_adapter_kwargs": {
-      "input_dim": 1536,
-      "output_dim": 768,
-      "d_model": 768,
-      "max_source_positions": 1500,
-      "encoder_layers": 4,
-      "encoder_attention_heads": 12,
-      "encoder_ffn_dim": 3072
-    },
-    "downsample_kwargs": {
-      "d_model": 768,
-      "avg_pooler": 4
-    },
-    "quantizer_kwargs": {
-      "input_dim": 3072,
-      "rvq_dim": 512,
-      "output_dim": 3072,
-      "num_quantizers": 8,
-      "codebook_size": 1024,
-      "codebook_dim": 512,
-      "quantizer_dropout": 0.0
-    },
-    "post_rvq_adapter_kwargs": {
-      "input_dim": 3072,
-      "output_dim": 3072,
-      "d_model": 768,
-      "max_source_positions": 375,
-      "encoder_layers": 4,
-      "encoder_attention_heads": 12,
-      "encoder_ffn_dim": 3072
-    },
-    "upsample_kwargs": {
-      "d_model": 768,
-      "stride": 4
-    },
-    "acoustic_decoder_kwargs": {
-      "num_mel_bins": 80,
-      "sampling_rate": 16000,
-      "hop_length": 160,
-      "stride_size": 2,
-      "kernel_size": 3,
-      "d_model": 768,
-      "scale_embedding": false,
-      "max_audio_seconds": 30,
-      "decoder_layers": 12,
-      "decoder_attention_heads": 12,
-      "decoder_ffn_dim": 3072,
-      "activation_function": "gelu"
-    },
-    "vocos_kwargs": {
-      "input_channels": 80,
-      "dim": 512,
-      "intermediate_dim": 4096,
-      "num_layers": 30,
-      "n_fft": 960,
-      "hop_size": 240,
-      "padding": "same"
-    }
-  },
-  "torch_dtype": "float32",
-  "transformers_version": "4.51.0"
-}

examples/m1.wav DELETED Viewed

Binary file (64.8 kB)

preprocessor_config.json DELETED Viewed

@@ -1,13 +0,0 @@
-{
-  "chunk_length": 30,
-  "feature_size": 80,
-  "hop_length": 160,
-  "n_fft": 400,
-  "n_samples": 480000,
-  "nb_max_frames": 3000,
-  "padding_side": "right",
-  "padding_value": 0.0,
-  "sampling_rate": 16000,
-  "return_attention_mask": true,
-  "return_tensors": "pt"
-}

pytorch_model.bin DELETED Viewed

@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:fafbaf4ba0e6095be842230c4bd16ecf6d193b250718a5775f1ac7aa528d9110
-size 2137279502