Added Model
- README.md +160 -0
- config.yml +74 -0
- events.out.tfevents.1712388040.bookbot.343075.0.v2 +3 -0
- model.h5 +3 -0
- processor.json +1 -0
    	
        README.md
    CHANGED
    
---
language: sw
license: cc-by-sa-4.0
tags:
  - tensorflowtts
  - audio
  - text-to-speech
  - text-to-mel
inference: false
---

# LightSpeech MFA SW v1

LightSpeech MFA SW v1 is a text-to-mel-spectrogram model based on the [LightSpeech](https://arxiv.org/abs/2102.04040) architecture. This model was trained from scratch on a real audio dataset. The list of real speakers includes:

- sw-KE-OpenBible

We trained an acoustic Swahili model on our speech corpus using [Montreal Forced Aligner v2.0.0](https://github.com/MontrealCorpusTools/Montreal-Forced-Aligner) and used it as the duration extractor. That model, and consequently our model, uses the IPA phone set for Swahili. We used [gruut](https://github.com/rhasspy/gruut) for phonemization and followed these [steps](https://github.com/TensorSpeech/TensorFlowTTS/tree/master/examples/mfa_extraction) to perform duration extraction.
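
For illustration, gruut phonemization of Swahili text looks roughly like this. A minimal sketch, assuming gruut's Swahili support (language code `sw`); the exact text normalization and phoneme post-processing used during training may differ:

```py
from gruut import sentences

# Phonemize Swahili text into IPA phonemes with gruut.
for sent in sentences("habari ya asubuhi", lang="sw"):
    for word in sent:
        if word.phonemes:
            print(word.text, word.phonemes)
```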

This model was trained using the [TensorFlowTTS](https://github.com/TensorSpeech/TensorFlowTTS) framework. All training was done on a Scaleway RENDER-S VM with a Tesla P100 GPU. All necessary training scripts can be found in this [GitHub fork](https://github.com/bookbot-hive/TensorFlowTTS), and the [training metrics](https://huggingface.co/bookbot/lightspeech-mfa-sw-v1/tensorboard) were logged via TensorBoard.

## Model

| Model                   | Config                                                                            | SR (Hz) | Mel range (Hz) | FFT / Hop / Win (pt) | #steps |
| ----------------------- | --------------------------------------------------------------------------------- | ------- | -------------- | -------------------- | ------ |
| `lightspeech-mfa-sw-v1` | [Link](https://huggingface.co/bookbot/lightspeech-mfa-sw-v1/blob/main/config.yml) | 44.1K   | 20-11025       | 2048 / 512 / None    | 200K   |

## Training Procedure

<details>
  <summary>Feature Extraction Setting</summary>

    hop_size: 512 # Hop size.
    format: "npy"

</details>
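
As a quick sanity check on these settings: together with the 44.1 kHz sample rate from the table above, the hop size fixes the mel frame rate. A small sketch of the arithmetic:

```py
# Frame arithmetic implied by the feature-extraction settings above.
sample_rate = 44_100  # Hz, from the model table
hop_size = 512        # samples per hop

print(sample_rate / hop_size)         # ~86.13 mel frames per second
print(1000 * hop_size / sample_rate)  # ~11.61 ms per frame
```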

<details>
  <summary>Network Architecture Setting</summary>

    model_type: lightspeech
    lightspeech_params:
        dataset: "swahiliipa"
        n_speakers: 1
        encoder_hidden_size: 256
        encoder_num_hidden_layers: 3
        encoder_num_attention_heads: 2
        encoder_attention_head_size: 16
        encoder_intermediate_size: 1024
        encoder_intermediate_kernel_size:
            - 5
            - 25
            - 13
            - 9
        encoder_hidden_act: "mish"
        decoder_hidden_size: 256
        decoder_num_hidden_layers: 3
        decoder_num_attention_heads: 2
        decoder_attention_head_size: 16
        decoder_intermediate_size: 1024
        decoder_intermediate_kernel_size:
            - 17
            - 21
            - 9
            - 13
        decoder_hidden_act: "mish"
        variant_prediction_num_conv_layers: 2
        variant_predictor_filter: 256
        variant_predictor_kernel_size: 3
        variant_predictor_dropout_rate: 0.5
        num_mels: 80
        hidden_dropout_prob: 0.2
        attention_probs_dropout_prob: 0.1
        max_position_embeddings: 2048
        initializer_range: 0.02
        output_attentions: False
        output_hidden_states: False

</details>

<details>
  <summary>Data Loader Setting</summary>

    batch_size: 8 # Batch size for each GPU, assuming gradient_accumulation_steps == 1.
    eval_batch_size: 16
    remove_short_samples: true # Whether to remove samples shorter than batch_max_steps.
    allow_cache: true # Whether to cache the dataset; if true, it requires CPU memory.
    mel_length_threshold: 32 # Remove all targets with mel_length <= 32.
    is_shuffle: true # Shuffle the dataset after each epoch.

</details>
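
With gradient accumulation enabled (see the optimizer settings below), the effective batch size per update is larger than `batch_size` alone; a one-line sketch of the arithmetic:

```py
# Effective per-update batch size implied by the settings above and below.
batch_size = 8
gradient_accumulation_steps = 2
print(batch_size * gradient_accumulation_steps)  # 16 samples per optimizer step
```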

<details>
  <summary>Optimizer & Scheduler Setting</summary>

    optimizer_params:
        initial_learning_rate: 0.0001
        end_learning_rate: 0.00005
        decay_steps: 150000 # < train_max_steps is recommended.
        warmup_proportion: 0.02
        weight_decay: 0.001

    gradient_accumulation_steps: 2
    var_train_expr: null # trainable variable expr (e.g. 'embeddings|encoder|decoder'),
        # separated by |. If var_train_expr is null, all variables are trained.

</details>
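
The schedule implied by these settings is a linear warmup over the first 2% of training followed by polynomial decay. A minimal sketch in plain TensorFlow that reconstructs the shape of the schedule, not TensorFlowTTS's exact optimizer class:

```py
import tensorflow as tf

train_max_steps = 200_000
warmup_steps = int(0.02 * train_max_steps)  # warmup_proportion: 0.02

# Polynomial decay from the initial to the end learning rate over decay_steps.
decay = tf.keras.optimizers.schedules.PolynomialDecay(
    initial_learning_rate=1e-4,
    decay_steps=150_000,
    end_learning_rate=5e-5,
)

def learning_rate(step: int) -> float:
    if step < warmup_steps:
        # Linear warmup toward the initial learning rate.
        return 1e-4 * step / warmup_steps
    return float(decay(step - warmup_steps))
```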

<details>
  <summary>Interval Setting</summary>

    train_max_steps: 200000 # Number of training steps.
    save_interval_steps: 5000 # Interval steps to save checkpoint.
    eval_interval_steps: 5000 # Interval steps to evaluate the network.
    log_interval_steps: 200 # Interval steps to record the training log.
    delay_f0_energy_steps: 3 # 2 steps use LR outputs only, then 1 step uses LR + F0 + Energy.

</details>

<details>
  <summary>Other Setting</summary>

    num_save_intermediate_results: 1 # Number of batches to save as intermediate results.

</details>

## How to Use

```py
import tensorflow as tf
from tensorflow_tts.inference import TFAutoModel, AutoProcessor

lightspeech = TFAutoModel.from_pretrained("bookbot/lightspeech-mfa-sw-v1")
processor = AutoProcessor.from_pretrained("bookbot/lightspeech-mfa-sw-v1")

text, speaker_name = "Hello World", "sw-KE-OpenBible"
input_ids = processor.text_to_sequence(text)

mel, duration_outputs, _ = lightspeech.inference(
    input_ids=tf.expand_dims(tf.convert_to_tensor(input_ids, dtype=tf.int32), 0),
    speaker_ids=tf.convert_to_tensor(
        [processor.speakers_map[speaker_name]], dtype=tf.int32
    ),
    speed_ratios=tf.convert_to_tensor([1.0], dtype=tf.float32),
    f0_ratios=tf.convert_to_tensor([1.0], dtype=tf.float32),
    energy_ratios=tf.convert_to_tensor([1.0], dtype=tf.float32),
)
```
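
The snippet above stops at the mel spectrogram. To get audio, pair the model with a vocoder trained on matching features (44.1 kHz, 80 mels). A minimal sketch following the usual TensorFlowTTS pattern; the vocoder checkpoint name below is a placeholder, not a specific published model:

```py
import soundfile as sf
from tensorflow_tts.inference import TFAutoModel

# Placeholder checkpoint; substitute a compatible MB-MelGAN/HiFi-GAN vocoder.
vocoder = TFAutoModel.from_pretrained("your-org/your-vocoder-checkpoint")

# Convert the mel spectrogram from the snippet above into a waveform.
audio = vocoder.inference(mel)[0, :, 0]
sf.write("./audio.wav", audio, 44100, "PCM_16")
```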

## Disclaimer

Do consider the biases that come from the training dataset, as they may carry over into this model's outputs.

## Authors

LightSpeech MFA SW v1 was trained and evaluated by [David Samuel Setiawan](https://davidsamuell.github.io/) and [Wilson Wongso](https://wilsonwongso.dev/). All computation and development were done on Scaleway.

## Framework versions

- TensorFlowTTS 1.8
- TensorFlow 2.7.0
    	
        config.yml
    ADDED
    
```yaml
allow_cache: true
batch_size: 8
config: ./TensorFlowTTS/examples/lightspeech/conf/lightspeech_swahiliipa.yaml
dataset_config: TensorFlowTTS/preprocess/swahiliipa_preprocess.yaml
dataset_mapping: dump/swahiliipa_mapper.json
dataset_stats: dump/stats.npy
delay_f0_energy_steps: 3
dev_dir: ./dump/valid/
energy_stat: ./dump/stats_energy.npy
eval_batch_size: 16
eval_interval_steps: 5000
f0_stat: ./dump/stats_f0.npy
format: npy
gradient_accumulation_steps: 2
hop_size: 512
is_shuffle: true
lightspeech_params:
  attention_probs_dropout_prob: 0.1
  dataset: swahiliipa
  decoder_attention_head_size: 16
  decoder_hidden_act: mish
  decoder_hidden_size: 256
  decoder_intermediate_kernel_size:
  - 17
  - 21
  - 9
  - 13
  decoder_intermediate_size: 1024
  decoder_num_attention_heads: 2
  decoder_num_hidden_layers: 3
  encoder_attention_head_size: 16
  encoder_hidden_act: mish
  encoder_hidden_size: 256
  encoder_intermediate_kernel_size:
  - 5
  - 25
  - 13
  - 9
  encoder_intermediate_size: 1024
  encoder_num_attention_heads: 2
  encoder_num_hidden_layers: 3
  hidden_dropout_prob: 0.2
  initializer_range: 0.02
  max_position_embeddings: 2048
  n_speakers: 1
  num_mels: 80
  output_attentions: false
  output_hidden_states: false
  variant_prediction_num_conv_layers: 2
  variant_predictor_dropout_rate: 0.5
  variant_predictor_filter: 256
  variant_predictor_kernel_size: 3
log_interval_steps: 200
mel_length_threshold: 32
mixed_precision: true
model_type: lightspeech
num_save_intermediate_results: 1
optimizer_params:
  decay_steps: 150000
  end_learning_rate: 5.0e-05
  initial_learning_rate: 0.001
  warmup_proportion: 0.02
  weight_decay: 0.001
outdir: ./lightspeech-openbible
pretrained: ''
remove_short_samples: true
resume: ''
save_interval_steps: 5000
train_dir: ./dump/train/
train_max_steps: 200000
use_norm: true
var_train_expr: null
verbose: 1
version: '0.0'
```
    	
        events.out.tfevents.1712388040.bookbot.343075.0.v2
    ADDED
    
```
version https://git-lfs.github.com/spec/v1
oid sha256:2a1dd5aa5acee3af9aadc9ec378fb480c882b7fae3030004ee60b46d56a8c692
size 234708
```
    	
        model.h5
    ADDED
    
```
version https://git-lfs.github.com/spec/v1
oid sha256:063c3c6775c287e386c69b98dca7df41de2cff9846bebe1b7f3bd59b02ee24dc
size 19484280
```
    	
        processor.json
    ADDED
    
```json
{"symbol_to_id": {"@PAD": 0, "@f": 1, "@h": 2, "@i": 3, "@j": 4, "@k": 5, "@l": 6, "@m": 7, "@n": 8, "@p": 9, "@s": 10, "@t": 11, "@t\u0361\u0283": 12, "@u": 13, "@v": 14, "@w": 15, "@x": 16, "@z": 17, "@\u00f0": 18, "@\u0251": 19, "@\u0253": 20, "@\u0254": 21, "@\u0257": 22, "@\u025b": 23, "@\u0260": 24, "@\u0263": 25, "@\u027e": 26, "@\u0283": 27, "@\u0284": 28, "@\u03b8": 29, "@\u1d50\u0253": 30, "@\u1d51g": 31, "@\u1dacv": 32, "@\u207fz": 33, "@\u207f\u0257": 34, "@\u207f\u0257\u0361\u0292": 35, "!": 36, ",": 37, ".": 38, "?": 39, ";": 40, ":": 41, "@SIL": 42, "@EOS": 43}, "id_to_symbol": {"0": "@PAD", "1": "@f", "2": "@h", "3": "@i", "4": "@j", "5": "@k", "6": "@l", "7": "@m", "8": "@n", "9": "@p", "10": "@s", "11": "@t", "12": "@t\u0361\u0283", "13": "@u", "14": "@v", "15": "@w", "16": "@x", "17": "@z", "18": "@\u00f0", "19": "@\u0251", "20": "@\u0253", "21": "@\u0254", "22": "@\u0257", "23": "@\u025b", "24": "@\u0260", "25": "@\u0263", "26": "@\u027e", "27": "@\u0283", "28": "@\u0284", "29": "@\u03b8", "30": "@\u1d50\u0253", "31": "@\u1d51g", "32": "@\u1dacv", "33": "@\u207fz", "34": "@\u207f\u0257", "35": "@\u207f\u0257\u0361\u0292", "36": "!", "37": ",", "38": ".", "39": "?", "40": ";", "41": ":", "42": "@SIL", "43": "@EOS"}, "speakers_map": {"sw-KE-OpenBible": 0}, "processor_name": "SwahiliIPAProcessor"}
```
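
For illustration, the symbol map can be applied directly. A minimal sketch that bypasses AutoProcessor (the real SwahiliIPAProcessor also handles phonemization and EOS tokens); the phoneme sequence below is a hypothetical example:

```py
import json

# Load the processor's symbol table.
with open("processor.json") as f:
    proc = json.load(f)

symbol_to_id = proc["symbol_to_id"]

# Map an IPA phoneme sequence (phonemes are prefixed with "@") to IDs.
phonemes = ["@h", "@ɑ", "@m", "@ɑ", "@EOS"]  # hypothetical sequence
print([symbol_to_id[p] for p in phonemes])
```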

