Training in progress, step 1000
- .gitignore +1 -0
- added_tokens.json +109 -0
- config.json +41 -0
- evaluate_models.py +73 -0
- fine-tune-whisper-non-streaming.ipynb +1225 -0
- fine-tune-whisper-non-streaming.ipynb.1 +1207 -0
- merges.txt +0 -0
- mgb2_speech.py +152 -0
- normalizer.json +1742 -0
- preprocess_dataset.py +89 -0
- preprocessor_config.json +0 -0
- pytorch_model.bin +3 -0
- run.sh +38 -0
- run_eval_whisper_streaming.py +166 -0
- run_mgb2.sh +37 -0
- run_speech_recognition_seq2seq.py +607 -0
- run_speech_recognition_seq2seq_mixed_mgb2.py +738 -0
- run_speech_recognition_seq2seq_streaming.py +608 -0
- runs/Dec13_21-34-37_129-146-107-47/1670967296.8737977/events.out.tfevents.1670967296.129-146-107-47.73247.1 +3 -0
- runs/Dec13_21-34-37_129-146-107-47/events.out.tfevents.1670967296.129-146-107-47.73247.0 +3 -0
- runs/Dec13_21-37-24_129-146-107-47/1670967464.0219538/events.out.tfevents.1670967464.129-146-107-47.73685.1 +3 -0
- runs/Dec13_21-37-24_129-146-107-47/events.out.tfevents.1670967464.129-146-107-47.73685.0 +3 -0
- setup_env.sh +10 -0
- setup_jupyter.sh +4 -0
- setup_libs.sh +12 -0
- setup_libs_colab.sh +11 -0
- special_tokens_map.json +133 -0
- split_mgb2_test.py +20 -0
- split_xml_mgb2.py +48 -0
- tokenizer_config.json +36 -0
- training_args.bin +3 -0
- vocab.json +0 -0
    	
.gitignore
ADDED

@@ -0,0 +1 @@
+checkpoint-*/
    	
added_tokens.json
ADDED

@@ -0,0 +1,109 @@
+{
+  "<|af|>": 50327,
+  "<|am|>": 50334,
+  "<|ar|>": 50272,
+  "<|as|>": 50350,
+  "<|az|>": 50304,
+  "<|ba|>": 50355,
+  "<|be|>": 50330,
+  "<|bg|>": 50292,
+  "<|bn|>": 50302,
+  "<|bo|>": 50347,
+  "<|br|>": 50309,
+  "<|bs|>": 50315,
+  "<|ca|>": 50270,
+  "<|cs|>": 50283,
+  "<|cy|>": 50297,
+  "<|da|>": 50285,
+  "<|de|>": 50261,
+  "<|el|>": 50281,
+  "<|endoftext|>": 50257,
+  "<|en|>": 50259,
+  "<|es|>": 50262,
+  "<|et|>": 50307,
+  "<|eu|>": 50310,
+  "<|fa|>": 50300,
+  "<|fi|>": 50277,
+  "<|fo|>": 50338,
+  "<|fr|>": 50265,
+  "<|gl|>": 50319,
+  "<|gu|>": 50333,
+  "<|haw|>": 50352,
+  "<|ha|>": 50354,
+  "<|hi|>": 50276,
+  "<|hr|>": 50291,
+  "<|ht|>": 50339,
+  "<|hu|>": 50286,
+  "<|hy|>": 50312,
+  "<|id|>": 50275,
+  "<|is|>": 50311,
+  "<|it|>": 50274,
+  "<|iw|>": 50279,
+  "<|ja|>": 50266,
+  "<|jw|>": 50356,
+  "<|ka|>": 50329,
+  "<|kk|>": 50316,
+  "<|km|>": 50323,
+  "<|kn|>": 50306,
+  "<|ko|>": 50264,
+  "<|la|>": 50294,
+  "<|lb|>": 50345,
+  "<|ln|>": 50353,
+  "<|lo|>": 50336,
+  "<|lt|>": 50293,
+  "<|lv|>": 50301,
+  "<|mg|>": 50349,
+  "<|mi|>": 50295,
+  "<|mk|>": 50308,
+  "<|ml|>": 50296,
+  "<|mn|>": 50314,
+  "<|mr|>": 50320,
+  "<|ms|>": 50282,
+  "<|mt|>": 50343,
+  "<|my|>": 50346,
+  "<|ne|>": 50313,
+  "<|nl|>": 50271,
+  "<|nn|>": 50342,
+  "<|nocaptions|>": 50362,
+  "<|notimestamps|>": 50363,
+  "<|no|>": 50288,
+  "<|oc|>": 50328,
+  "<|pa|>": 50321,
+  "<|pl|>": 50269,
+  "<|ps|>": 50340,
+  "<|pt|>": 50267,
+  "<|ro|>": 50284,
+  "<|ru|>": 50263,
+  "<|sa|>": 50344,
+  "<|sd|>": 50332,
+  "<|si|>": 50322,
+  "<|sk|>": 50298,
+  "<|sl|>": 50305,
+  "<|sn|>": 50324,
+  "<|so|>": 50326,
+  "<|sq|>": 50317,
+  "<|sr|>": 50303,
+  "<|startoflm|>": 50360,
+  "<|startofprev|>": 50361,
+  "<|startoftranscript|>": 50258,
+  "<|su|>": 50357,
+  "<|sv|>": 50273,
+  "<|sw|>": 50318,
+  "<|ta|>": 50287,
+  "<|te|>": 50299,
+  "<|tg|>": 50331,
+  "<|th|>": 50289,
+  "<|tk|>": 50341,
+  "<|tl|>": 50348,
+  "<|transcribe|>": 50359,
+  "<|translate|>": 50358,
+  "<|tr|>": 50268,
+  "<|tt|>": 50351,
+  "<|uk|>": 50280,
+  "<|ur|>": 50290,
+  "<|uz|>": 50337,
+  "<|vi|>": 50278,
+  "<|yi|>": 50335,
+  "<|yo|>": 50325,
+  "<|zh|>": 50260
+}
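For reference, a minimal sketch (not part of this commit) showing that these entries are Whisper's special language/task tokens and resolve to the listed ids; the checkpoint name is an assumption taken from `_name_or_path` in config.json below:

from transformers import WhisperTokenizer

# Assumption: the base checkpoint named in config.json below.
tokenizer = WhisperTokenizer.from_pretrained("openai/whisper-medium")

# The special tokens map straight to the ids listed in added_tokens.json.
print(tokenizer.convert_tokens_to_ids("<|ar|>"))                 # 50272
print(tokenizer.convert_tokens_to_ids("<|startoftranscript|>"))  # 50258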
    	
config.json
ADDED

@@ -0,0 +1,41 @@
+{
+  "_name_or_path": "openai/whisper-medium",
+  "activation_dropout": 0.0,
+  "activation_function": "gelu",
+  "architectures": [
+    "WhisperForConditionalGeneration"
+  ],
+  "attention_dropout": 0.0,
+  "begin_suppress_tokens": [
+    220,
+    50257
+  ],
+  "bos_token_id": 50257,
+  "d_model": 1024,
+  "decoder_attention_heads": 16,
+  "decoder_ffn_dim": 4096,
+  "decoder_layerdrop": 0.0,
+  "decoder_layers": 24,
+  "decoder_start_token_id": 50258,
+  "dropout": 0.0,
+  "encoder_attention_heads": 16,
+  "encoder_ffn_dim": 4096,
+  "encoder_layerdrop": 0.0,
+  "encoder_layers": 24,
+  "eos_token_id": 50257,
+  "forced_decoder_ids": null,
+  "init_std": 0.02,
+  "is_encoder_decoder": true,
+  "max_length": 448,
+  "max_source_positions": 1500,
+  "max_target_positions": 448,
+  "model_type": "whisper",
+  "num_hidden_layers": 24,
+  "num_mel_bins": 80,
+  "pad_token_id": 50257,
+  "scale_embedding": false,
+  "torch_dtype": "float32",
+  "transformers_version": "4.26.0.dev0",
+  "use_cache": false,
+  "vocab_size": 51865
+}
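As a quick sanity check (a sketch, not in the commit), this config describes the Whisper medium architecture (24 encoder and 24 decoder layers, d_model 1024) and can be instantiated directly:

from transformers import WhisperConfig, WhisperForConditionalGeneration

# Assumption: run from a directory containing the config.json above.
config = WhisperConfig.from_pretrained(".")
model = WhisperForConditionalGeneration(config)  # randomly initialised weights
print(f"{sum(p.numel() for p in model.parameters()) / 1e6:.0f}M parameters")  # roughly 769M for medium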
    	
evaluate_models.py
ADDED

@@ -0,0 +1,73 @@
+import torch
+import librosa  # imported by the original script; not used directly below
+from datasets import load_dataset, Audio
+from transformers import WhisperProcessor, WhisperFeatureExtractor, WhisperTokenizer, WhisperForConditionalGeneration
+from huggingface_hub import login
+import argparse
+from evaluate import load
+
+my_parser = argparse.ArgumentParser()
+# my_parser.add_argument("--pal", "-paths_as_labels", action="store_true")
+
+my_parser.add_argument("--model_name", "-model_name", type=str, action="store", default="openai/whisper-tiny")
+my_parser.add_argument("--hf_token", "-hf_token", type=str, action="store")
+my_parser.add_argument("--dataset_name", "-dataset_name", type=str, action="store", default="google/fleurs")
+my_parser.add_argument("--split", "-split", type=str, action="store", default="test")
+my_parser.add_argument("--subset", "-subset", type=str, action="store")
+
+args = my_parser.parse_args()
+try:
+    login(args.hf_token)
+except Exception as exc:
+    raise SystemExit("Login failed; please pass a valid Hugging Face token via --hf_token") from exc
+
+
+dataset_name = args.dataset_name
+model_name = args.model_name
+subset = args.subset
+text_column = "sentence"
+if dataset_name == "google/fleurs":
+    text_column = "transcription"
+
+print(f"Evaluating {args.model_name} on {args.dataset_name} [{subset}]")
+
+
+feature_extractor = WhisperFeatureExtractor.from_pretrained(model_name)
+model = WhisperForConditionalGeneration.from_pretrained(model_name)
+
+test_dataset = load_dataset(dataset_name, subset, split=args.split, use_auth_token=True)
+processor = WhisperProcessor.from_pretrained(model_name, language="Arabic", task="transcribe")
+tokenizer = WhisperTokenizer.from_pretrained(model_name, language="Arabic", task="transcribe")
+test_dataset = test_dataset.cast_column("audio", Audio(sampling_rate=16000))
+
+# Preprocessing the datasets.
+def prepare_dataset(batch):
+    # load and resample audio data from 48 to 16kHz
+    audio = batch["audio"]
+
+    # compute log-Mel input features from the input audio array
+    batch["input_features"] = feature_extractor(audio["array"], sampling_rate=audio["sampling_rate"]).input_features[0]
+
+    # encode target text to label ids
+    batch["labels"] = tokenizer(batch[text_column]).input_ids
+    return batch
+
+test_dataset = test_dataset.map(prepare_dataset)
+
+model = model.to("cuda")
+model.config.forced_decoder_ids = processor.get_decoder_prompt_ids(language="ar", task="transcribe")
+
+def map_to_result(batch):
+    # greedy-decode one example; the forced decoder ids set above steer Arabic transcription
+    with torch.no_grad():
+        input_values = torch.tensor(batch["input_features"], device="cuda").unsqueeze(0)
+        pred_ids = model.generate(input_values)
+
+    batch["pred_str"] = processor.batch_decode(pred_ids, skip_special_tokens=True)[0]
+    batch["text"] = processor.decode(batch["labels"], skip_special_tokens=True)
+
+    return batch
+results = test_dataset.map(map_to_result)
+
+wer = load("wer")
+print("Test WER: {:.3f}".format(wer.compute(predictions=results["pred_str"], references=results["text"])))
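For context, a toy example (ours, not part of the script) of what the final `wer.compute` call measures:

from evaluate import load

wer = load("wer")
# WER = (substitutions + insertions + deletions) / number of reference words
score = wer.compute(predictions=["the cat sat"], references=["the cat sat down"])
print(score)  # 0.25: one deleted word out of four reference words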
    	
fine-tune-whisper-non-streaming.ipynb
ADDED

@@ -0,0 +1,1225 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "id": "75b58048-7d14-4fc6-8085-1fc08c81b4a6",
+   "metadata": {
+    "id": "75b58048-7d14-4fc6-8085-1fc08c81b4a6"
+   },
+   "source": [
+    "# Fine-Tune Whisper For Multilingual ASR with 🤗 Transformers"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "fbfa8ad5-4cdc-4512-9058-836cbbf65e1a",
+   "metadata": {
+    "id": "fbfa8ad5-4cdc-4512-9058-836cbbf65e1a"
+   },
+   "source": [
+    "In this Colab, we present a step-by-step guide on how to fine-tune Whisper \n",
+    "for any multilingual ASR dataset using Hugging Face 🤗 Transformers. This is a \n",
+    "more \"hands-on\" version of the accompanying [blog post](https://huggingface.co/blog/fine-tune-whisper). \n",
+    "For a more in-depth explanation of Whisper, the Common Voice dataset and the theory behind fine-tuning, the reader is advised to refer to the blog post."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "afe0d503-ae4e-4aa7-9af4-dbcba52db41e",
+   "metadata": {
+    "id": "afe0d503-ae4e-4aa7-9af4-dbcba52db41e"
+   },
+   "source": [
+    "## Introduction"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "9ae91ed4-9c3e-4ade-938e-f4c2dcfbfdc0",
+   "metadata": {
+    "id": "9ae91ed4-9c3e-4ade-938e-f4c2dcfbfdc0"
+   },
+   "source": [
+    "Whisper is a pre-trained model for automatic speech recognition (ASR) \n",
+    "published in [September 2022](https://openai.com/blog/whisper/) by the authors \n",
+    "Alec Radford et al. from OpenAI. Unlike many of its predecessors, such as \n",
+    "[Wav2Vec 2.0](https://arxiv.org/abs/2006.11477), which are pre-trained \n",
+    "on un-labelled audio data, Whisper is pre-trained on a vast quantity of \n",
+    "**labelled** audio-transcription data, 680,000 hours to be precise. \n",
+    "This is an order of magnitude more data than the un-labelled audio data used \n",
+    "to train Wav2Vec 2.0 (60,000 hours). What is more, 117,000 hours of this \n",
+    "pre-training data is multilingual ASR data. This results in checkpoints \n",
+    "that can be applied to over 96 languages, many of which are considered \n",
+    "_low-resource_.\n",
+    "\n",
+    "When scaled to 680,000 hours of labelled pre-training data, Whisper models \n",
+    "demonstrate a strong ability to generalise to many datasets and domains.\n",
+    "The pre-trained checkpoints achieve competitive results to state-of-the-art \n",
+    "ASR systems, with near 3% word error rate (WER) on the test-clean subset of \n",
+    "LibriSpeech ASR and a new state-of-the-art on TED-LIUM with 4.7% WER (_c.f._ \n",
+    "Table 8 of the [Whisper paper](https://cdn.openai.com/papers/whisper.pdf)).\n",
+    "The extensive multilingual ASR knowledge acquired by Whisper during pre-training \n",
+    "can be leveraged for other low-resource languages; through fine-tuning, the \n",
+    "pre-trained checkpoints can be adapted for specific datasets and languages \n",
+    "to further improve upon these results. We'll show just how Whisper can be fine-tuned \n",
+    "for low-resource languages in this Colab."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "e59b91d6-be24-4b5e-bb38-4977ea143a72",
+   "metadata": {
+    "id": "e59b91d6-be24-4b5e-bb38-4977ea143a72"
+   },
+   "source": [
+    "<figure>\n",
+    "<img src=\"https://raw.githubusercontent.com/sanchit-gandhi/notebooks/main/whisper_architecture.svg\" alt=\"Trulli\" style=\"width:100%\">\n",
+    "<figcaption align = \"center\"><b>Figure 1:</b> Whisper model. The architecture \n",
+    "follows the standard Transformer-based encoder-decoder model. A \n",
+    "log-Mel spectrogram is input to the encoder. The last encoder \n",
+    "hidden states are input to the decoder via cross-attention mechanisms. The \n",
+    "decoder autoregressively predicts text tokens, jointly conditional on the \n",
+    "encoder hidden states and previously predicted tokens. Figure source: \n",
+    "<a href=\"https://openai.com/blog/whisper/\">OpenAI Whisper Blog</a>.</figcaption>\n",
+    "</figure>"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "21b6316e-8a55-4549-a154-66d3da2ab74a",
+   "metadata": {
+    "id": "21b6316e-8a55-4549-a154-66d3da2ab74a"
+   },
+   "source": [
+    "The Whisper checkpoints come in five configurations of varying model sizes.\n",
+    "The smallest four are trained on either English-only or multilingual data.\n",
+    "The largest checkpoint is multilingual only. All nine of the pre-trained checkpoints \n",
+    "are available on the [Hugging Face Hub](https://huggingface.co/models?search=openai/whisper). The \n",
+    "checkpoints are summarised in the following table with links to the models on the Hub:\n",
+    "\n",
+    "| Size   | Layers | Width | Heads | Parameters | English-only                                         | Multilingual                                      |\n",
+    "|--------|--------|-------|-------|------------|------------------------------------------------------|---------------------------------------------------|\n",
+    "| tiny   | 4      | 384   | 6     | 39 M       | [✓](https://huggingface.co/openai/whisper-tiny.en)   | [✓](https://huggingface.co/openai/whisper-tiny)   |\n",
+    "| base   | 6      | 512   | 8     | 74 M       | [✓](https://huggingface.co/openai/whisper-base.en)   | [✓](https://huggingface.co/openai/whisper-base)   |\n",
+    "| small  | 12     | 768   | 12    | 244 M      | [✓](https://huggingface.co/openai/whisper-small.en)  | [✓](https://huggingface.co/openai/whisper-small)  |\n",
+    "| medium | 24     | 1024  | 16    | 769 M      | [✓](https://huggingface.co/openai/whisper-medium.en) | [✓](https://huggingface.co/openai/whisper-medium) |\n",
+    "| large  | 32     | 1280  | 20    | 1550 M     | x                                                    | [✓](https://huggingface.co/openai/whisper-large)  |\n",
+    "\n",
+    "For demonstration purposes, we'll fine-tune the multilingual version of the \n",
+    "[`\"small\"`](https://huggingface.co/openai/whisper-small) checkpoint with 244M params (~= 1GB). \n",
+    "As for our data, we'll train and evaluate our system on a low-resource language \n",
+    "taken from the [Common Voice](https://huggingface.co/datasets/mozilla-foundation/common_voice_11_0)\n",
+    "dataset. We'll show that with as little as 8 hours of fine-tuning data, we can achieve \n",
+    "strong performance in this language."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "3a680dfc-cbba-4f6c-8a1f-e1a5ff3f123a",
+   "metadata": {
+    "id": "3a680dfc-cbba-4f6c-8a1f-e1a5ff3f123a"
+   },
+   "source": [
+    "------------------------------------------------------------------------\n",
+    "\n",
+    "\\\\({}^1\\\\) The name Whisper follows from the acronym “WSPSR”, which stands for “Web-scale Supervised Pre-training for Speech Recognition”."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "b219c9dd-39b6-4a95-b2a1-3f547a1e7bc0",
+   "metadata": {
+    "id": "b219c9dd-39b6-4a95-b2a1-3f547a1e7bc0"
+   },
+   "source": [
+    "## Load Dataset"
+   ]
+  },
+  {
+   "attachments": {},
+   "cell_type": "markdown",
+   "id": "674429c5-0ab4-4adf-975b-621bb69eca38",
+   "metadata": {
+    "id": "674429c5-0ab4-4adf-975b-621bb69eca38"
+   },
+   "source": [
+    "Using 🤗 Datasets, downloading and preparing data is extremely simple. \n",
+    "We can download and prepare the Common Voice splits in just one line of code. \n",
+    "\n",
+    "First, ensure you have accepted the terms of use on the Hugging Face Hub: [mozilla-foundation/common_voice_11_0](https://huggingface.co/datasets/mozilla-foundation/common_voice_11_0). Once you have accepted the terms, you will have full access to the dataset and be able to download the data locally.\n",
+    "\n",
+    "Since Arabic is very low-resource, we'll combine the `train` and `validation` \n",
+    "splits to give approximately 8 hours of training data. We'll use the 4 hours \n",
+    "of `test` data as our held-out test set:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "5de15d70",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "model_name = \"openai/whisper-small\""
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "a2787582-554f-44ce-9f38-4180a5ed6b44",
+   "metadata": {
+    "id": "a2787582-554f-44ce-9f38-4180a5ed6b44"
+   },
+   "outputs": [],
+   "source": [
+    "from datasets import load_dataset, DatasetDict\n",
+    "\n",
+    "common_voice = DatasetDict()\n",
+    "\n",
+    "common_voice[\"train\"] = load_dataset(\"mozilla-foundation/common_voice_11_0\", \"ar\", split=\"train+validation\", use_auth_token=True)\n",
+    "common_voice[\"test\"] = load_dataset(\"mozilla-foundation/common_voice_11_0\", \"ar\", split=\"test\", use_auth_token=True)\n",
+    "\n",
+    "print(common_voice)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "d5c7c3d6-7197-41e7-a088-49b753c1681f",
+   "metadata": {
+    "id": "d5c7c3d6-7197-41e7-a088-49b753c1681f"
+   },
+   "source": [
+    "Most ASR datasets only provide input audio samples (`audio`) and the \n",
+    "corresponding transcribed text (`sentence`). Common Voice contains additional \n",
+    "metadata information, such as `accent` and `locale`, which we can disregard for ASR.\n",
+    "Keeping the notebook as general as possible, we only consider the input audio and\n",
+    "transcribed text for fine-tuning, discarding the additional metadata information:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "20ba635d-518c-47ac-97ee-3cad25f1e0ce",
+   "metadata": {
+    "id": "20ba635d-518c-47ac-97ee-3cad25f1e0ce"
+   },
+   "outputs": [],
+   "source": [
+    "common_voice = common_voice.remove_columns([\"accent\", \"age\", \"client_id\", \"down_votes\", \"gender\", \"locale\", \"path\", \"segment\", \"up_votes\"])\n",
+    "\n",
+    "print(common_voice)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "2d63b2d2-f68a-4d74-b7f1-5127f6d16605",
+   "metadata": {
+    "id": "2d63b2d2-f68a-4d74-b7f1-5127f6d16605"
+   },
+   "source": [
+    "## Prepare Feature Extractor, Tokenizer and Data"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "601c3099-1026-439e-93e2-5635b3ba5a73",
+   "metadata": {
+    "id": "601c3099-1026-439e-93e2-5635b3ba5a73"
+   },
+   "source": [
+    "The ASR pipeline can be de-composed into three stages: \n",
+    "1) A feature extractor which pre-processes the raw audio-inputs\n",
+    "2) The model which performs the sequence-to-sequence mapping \n",
+    "3) A tokenizer which post-processes the model outputs to text format\n",
+    "\n",
+    "In 🤗 Transformers, the Whisper model has an associated feature extractor and tokenizer, \n",
+    "called [WhisperFeatureExtractor](https://huggingface.co/docs/transformers/main/model_doc/whisper#transformers.WhisperFeatureExtractor)\n",
+    "and [WhisperTokenizer](https://huggingface.co/docs/transformers/main/model_doc/whisper#transformers.WhisperTokenizer) \n",
+    "respectively.\n",
+    "\n",
+    "We'll go through details for setting-up the feature extractor and tokenizer one-by-one!"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "560332eb-3558-41a1-b500-e83a9f695f84",
+   "metadata": {
+    "id": "560332eb-3558-41a1-b500-e83a9f695f84"
+   },
+   "source": [
+    "### Load WhisperFeatureExtractor"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "32ec8068-0bd7-412d-b662-0edb9d1e7365",
+   "metadata": {
+    "id": "32ec8068-0bd7-412d-b662-0edb9d1e7365"
+   },
+   "source": [
+    "The Whisper feature extractor performs two operations:\n",
+    "1. Pads / truncates the audio inputs to 30s: any audio inputs shorter than 30s are padded to 30s with silence (zeros), and those longer than 30s are truncated to 30s\n",
+    "2. Converts the audio inputs to _log-Mel spectrogram_ input features, a visual representation of the audio and the form of the input expected by the Whisper model"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "589d9ec1-d12b-4b64-93f7-04c63997da19",
+   "metadata": {
+    "id": "589d9ec1-d12b-4b64-93f7-04c63997da19"
+   },
+   "source": [
+    "<figure>\n",
+    "<img src=\"https://raw.githubusercontent.com/sanchit-gandhi/notebooks/main/spectrogram.jpg\" alt=\"Trulli\" style=\"width:100%\">\n",
+    "<figcaption align = \"center\"><b>Figure 2:</b> Conversion of sampled audio array to log-Mel spectrogram.\n",
+    "Left: sampled 1-dimensional audio signal. Right: corresponding log-Mel spectrogram. Figure source:\n",
+    "<a href=\"https://ai.googleblog.com/2019/04/specaugment-new-data-augmentation.html\">Google SpecAugment Blog</a>.\n",
+    "</figcaption>"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "b2ef54d5-b946-4c1d-9fdc-adc5d01b46aa",
+   "metadata": {
+    "id": "b2ef54d5-b946-4c1d-9fdc-adc5d01b46aa"
+   },
+   "source": [
+    "We'll load the feature extractor from the pre-trained checkpoint with the default values:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "bc77d7bb-f9e2-47f5-b663-30f7a4321ce5",
+   "metadata": {
+    "id": "bc77d7bb-f9e2-47f5-b663-30f7a4321ce5"
+   },
+   "outputs": [],
+   "source": [
+    "from transformers import WhisperFeatureExtractor\n",
+    "\n",
+    "feature_extractor = WhisperFeatureExtractor.from_pretrained(model_name)"
+   ]
+  },
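A small illustration (a sketch of ours, not a cell of the committed notebook) of the two feature-extractor operations described above, assuming a dummy 16 kHz input:

import numpy as np
from transformers import WhisperFeatureExtractor

feature_extractor = WhisperFeatureExtractor.from_pretrained("openai/whisper-small")
five_seconds = np.zeros(5 * 16000, dtype=np.float32)  # shorter than 30s, so it gets padded
features = feature_extractor(five_seconds, sampling_rate=16000).input_features[0]
print(features.shape)  # (80, 3000): 80 mel bins x 3000 frames, i.e. a full 30s window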
+  {
+   "cell_type": "markdown",
+   "id": "93748af7-b917-4ecf-a0c8-7d89077ff9cb",
+   "metadata": {
+    "id": "93748af7-b917-4ecf-a0c8-7d89077ff9cb"
+   },
+   "source": [
+    "### Load WhisperTokenizer"
+   ]
+  },
+  {
+   "attachments": {},
+   "cell_type": "markdown",
+   "id": "2bc82609-a9fb-447a-a2af-99597c864029",
+   "metadata": {
+    "id": "2bc82609-a9fb-447a-a2af-99597c864029"
+   },
+   "source": [
+    "The Whisper model outputs a sequence of _token ids_. The tokenizer maps each of these token ids to their corresponding text string. For Arabic, we can load the pre-trained tokenizer and use it for fine-tuning without any further modifications. We simply have to \n",
+    "specify the target language and the task. These arguments inform the \n",
+    "tokenizer to prefix the language and task tokens to the start of encoded \n",
+    "label sequences:"
+   ]
+  },
| 328 | 
            +
              {
         | 
| 329 | 
            +
               "cell_type": "code",
         | 
| 330 | 
            +
               "execution_count": null,
         | 
| 331 | 
            +
               "id": "c7b07f9b-ae0e-4f89-98f0-0c50d432eab6",
         | 
| 332 | 
            +
               "metadata": {
         | 
| 333 | 
            +
                "id": "c7b07f9b-ae0e-4f89-98f0-0c50d432eab6",
         | 
| 334 | 
            +
                "outputId": "5c004b44-86e7-4e00-88be-39e0af5eed69"
         | 
| 335 | 
            +
               },
         | 
| 336 | 
            +
               "outputs": [
         | 
| 337 | 
            +
                {
         | 
| 338 | 
            +
                 "data": {
         | 
| 339 | 
            +
                  "application/vnd.jupyter.widget-view+json": {
         | 
| 340 | 
            +
                   "model_id": "90d056e20b3e4f14ae0199a1a4ab1bb0",
         | 
| 341 | 
            +
                   "version_major": 2,
         | 
| 342 | 
            +
                   "version_minor": 0
         | 
| 343 | 
            +
                  },
         | 
| 344 | 
            +
                  "text/plain": [
         | 
| 345 | 
            +
                   "Downloading:   0%|          | 0.00/829 [00:00<?, ?B/s]"
         | 
| 346 | 
            +
                  ]
         | 
| 347 | 
            +
                 },
         | 
| 348 | 
            +
                 "metadata": {},
         | 
| 349 | 
            +
                 "output_type": "display_data"
         | 
| 350 | 
            +
                },
         | 
| 351 | 
            +
                {
         | 
| 352 | 
            +
                 "data": {
         | 
| 353 | 
            +
                  "application/vnd.jupyter.widget-view+json": {
         | 
| 354 | 
            +
                   "model_id": "d82a88daec0e4f14add691b7b903064c",
         | 
| 355 | 
            +
                   "version_major": 2,
         | 
| 356 | 
            +
                   "version_minor": 0
         | 
| 357 | 
            +
                  },
         | 
| 358 | 
            +
                  "text/plain": [
         | 
| 359 | 
            +
                   "Downloading:   0%|          | 0.00/1.04M [00:00<?, ?B/s]"
         | 
| 360 | 
            +
                  ]
         | 
| 361 | 
            +
                 },
         | 
| 362 | 
            +
                 "metadata": {},
         | 
| 363 | 
            +
                 "output_type": "display_data"
         | 
| 364 | 
            +
                },
         | 
| 365 | 
            +
                {
         | 
| 366 | 
            +
                 "data": {
         | 
| 367 | 
            +
                  "application/vnd.jupyter.widget-view+json": {
         | 
| 368 | 
            +
                   "model_id": "350acdb0f40e454099fa901e66de55f0",
         | 
| 369 | 
            +
                   "version_major": 2,
         | 
| 370 | 
            +
                   "version_minor": 0
         | 
| 371 | 
            +
                  },
         | 
| 372 | 
            +
                  "text/plain": [
         | 
| 373 | 
            +
                   "Downloading:   0%|          | 0.00/494k [00:00<?, ?B/s]"
         | 
| 374 | 
            +
                  ]
         | 
| 375 | 
            +
                 },
         | 
| 376 | 
            +
                 "metadata": {},
         | 
| 377 | 
            +
                 "output_type": "display_data"
         | 
| 378 | 
            +
                },
         | 
| 379 | 
            +
                {
         | 
| 380 | 
            +
                 "data": {
         | 
| 381 | 
            +
                  "application/vnd.jupyter.widget-view+json": {
         | 
| 382 | 
            +
                   "model_id": "2e6a82a462cc411d90fa1bea4ee60790",
         | 
| 383 | 
            +
                   "version_major": 2,
         | 
| 384 | 
            +
                   "version_minor": 0
         | 
| 385 | 
            +
                  },
         | 
| 386 | 
            +
                  "text/plain": [
         | 
| 387 | 
            +
                   "Downloading:   0%|          | 0.00/52.7k [00:00<?, ?B/s]"
         | 
| 388 | 
            +
                  ]
         | 
| 389 | 
            +
                 },
         | 
| 390 | 
            +
                 "metadata": {},
         | 
| 391 | 
            +
                 "output_type": "display_data"
         | 
| 392 | 
            +
                },
         | 
| 393 | 
            +
                {
         | 
| 394 | 
            +
                 "data": {
         | 
| 395 | 
            +
                  "application/vnd.jupyter.widget-view+json": {
         | 
| 396 | 
            +
                   "model_id": "c74bfee0198b4817832ea86e8e88d96c",
         | 
| 397 | 
            +
                   "version_major": 2,
         | 
| 398 | 
            +
                   "version_minor": 0
         | 
| 399 | 
            +
                  },
         | 
| 400 | 
            +
                  "text/plain": [
         | 
| 401 | 
            +
                   "Downloading:   0%|          | 0.00/2.11k [00:00<?, ?B/s]"
         | 
| 402 | 
            +
                  ]
         | 
| 403 | 
            +
                 },
         | 
| 404 | 
            +
                 "metadata": {},
         | 
| 405 | 
            +
                 "output_type": "display_data"
         | 
| 406 | 
            +
                },
         | 
| 407 | 
            +
                {
         | 
| 408 | 
            +
                 "data": {
         | 
| 409 | 
            +
                  "application/vnd.jupyter.widget-view+json": {
         | 
| 410 | 
            +
                   "model_id": "04fb2d81eff646068e10475a08ae42f4",
         | 
| 411 | 
            +
                   "version_major": 2,
         | 
| 412 | 
            +
                   "version_minor": 0
         | 
| 413 | 
            +
                  },
         | 
| 414 | 
            +
                  "text/plain": [
         | 
| 415 | 
            +
                   "Downloading:   0%|          | 0.00/2.06k [00:00<?, ?B/s]"
         | 
| 416 | 
            +
                  ]
         | 
| 417 | 
            +
                 },
         | 
| 418 | 
            +
                 "metadata": {},
         | 
| 419 | 
            +
                 "output_type": "display_data"
         | 
| 420 | 
            +
                }
         | 
| 421 | 
            +
               ],
         | 
| 422 | 
            +
               "source": [
         | 
| 423 | 
            +
                "from transformers import WhisperTokenizer\n",
         | 
| 424 | 
            +
                "\n",
         | 
| 425 | 
            +
                "tokenizer = WhisperTokenizer.from_pretrained(\"openai/whisper-small\", language=\"Arabic\", task=\"transcribe\")"
         | 
| 426 | 
            +
               ]
         | 
| 427 | 
            +
              },
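         | 
|  | 
            +
              {
         | 
|  | 
            +
               "cell_type": "markdown",
         | 
|  | 
            +
               "metadata": {},
         | 
|  | 
            +
               "source": [
         | 
|  | 
            +
                "As a quick sanity check, we can encode and then decode the first transcription (a minimal sketch, assuming the `common_voice` dataset loaded earlier in this notebook). Decoding with the special tokens retained shows the language and task tokens that the tokenizer prefixes to the label sequence; skipping them should recover the original sentence:"
         | 
|  | 
            +
               ]
         | 
|  | 
            +
              },
         | 
|  | 
            +
              {
         | 
|  | 
            +
               "cell_type": "code",
         | 
|  | 
            +
               "execution_count": null,
         | 
|  | 
            +
               "metadata": {},
         | 
|  | 
            +
               "outputs": [],
         | 
|  | 
            +
               "source": [
         | 
|  | 
            +
                "input_str = common_voice[\"train\"][0][\"sentence\"]\n",
         | 
|  | 
            +
                "labels = tokenizer(input_str).input_ids\n",
         | 
|  | 
            +
                "\n",
         | 
|  | 
            +
                "# with special tokens: shows the prefixed language/task tokens\n",
         | 
|  | 
            +
                "print(tokenizer.decode(labels, skip_special_tokens=False))\n",
         | 
|  | 
            +
                "# without special tokens: should round-trip back to the input\n",
         | 
|  | 
            +
                "print(tokenizer.decode(labels, skip_special_tokens=True) == input_str)"
         | 
|  | 
            +
               ]
         | 
|  | 
            +
              },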
         | 
| 428 | 
            +
              {
         | 
| 429 | 
            +
               "cell_type": "markdown",
         | 
| 430 | 
            +
               "id": "d2ef23f3-f4a8-483a-a2dc-080a7496cb1b",
         | 
| 431 | 
            +
               "metadata": {
         | 
| 432 | 
            +
                "id": "d2ef23f3-f4a8-483a-a2dc-080a7496cb1b"
         | 
| 433 | 
            +
               },
         | 
| 434 | 
            +
               "source": [
         | 
| 435 | 
            +
                "### Combine To Create A WhisperProcessor"
         | 
| 436 | 
            +
               ]
         | 
| 437 | 
            +
              },
         | 
| 438 | 
            +
              {
         | 
| 439 | 
            +
               "cell_type": "markdown",
         | 
| 440 | 
            +
               "id": "5ff67654-5a29-4bb8-a69d-0228946c6f8d",
         | 
| 441 | 
            +
               "metadata": {
         | 
| 442 | 
            +
                "id": "5ff67654-5a29-4bb8-a69d-0228946c6f8d"
         | 
| 443 | 
            +
               },
         | 
| 444 | 
            +
               "source": [
         | 
| 445 | 
            +
                "To simplify using the feature extractor and tokenizer, we can _wrap_ \n",
         | 
| 446 | 
            +
                "both into a single `WhisperProcessor` class. This processor object \n",
         | 
| 447 | 
            +
                "inherits from the `WhisperFeatureExtractor` and `WhisperProcessor`, \n",
         | 
| 448 | 
            +
                "and can be used on the audio inputs and model predictions as required. \n",
         | 
| 449 | 
            +
                "In doing so, we only need to keep track of two objects during training: \n",
         | 
| 450 | 
            +
                "the `processor` and the `model`:"
         | 
| 451 | 
            +
               ]
         | 
| 452 | 
            +
              },
         | 
| 453 | 
            +
              {
         | 
| 454 | 
            +
               "cell_type": "code",
         | 
| 455 | 
            +
               "execution_count": null,
         | 
| 456 | 
            +
               "id": "77d9f0c5-8607-4642-a8ac-c3ab2e223ea6",
         | 
| 457 | 
            +
               "metadata": {
         | 
| 458 | 
            +
                "id": "77d9f0c5-8607-4642-a8ac-c3ab2e223ea6"
         | 
| 459 | 
            +
               },
         | 
| 460 | 
            +
               "outputs": [],
         | 
| 461 | 
            +
               "source": [
         | 
| 462 | 
            +
                "from transformers import WhisperProcessor\n",
         | 
| 463 | 
            +
                "\n",
         | 
| 464 | 
            +
                "processor = WhisperProcessor.from_pretrained(model_name, language=\"Arabic\", task=\"transcribe\")"
         | 
| 465 | 
            +
               ]
         | 
| 466 | 
            +
              },
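         | 
|  | 
            +
              {
         | 
|  | 
            +
               "cell_type": "markdown",
         | 
|  | 
            +
               "metadata": {},
         | 
|  | 
            +
               "source": [
         | 
|  | 
            +
                "The wrapped feature extractor and tokenizer remain accessible as attributes of the processor, which we'll rely on when preparing the data. A quick check (illustrative only):"
         | 
|  | 
            +
               ]
         | 
|  | 
            +
              },
         | 
|  | 
            +
              {
         | 
|  | 
            +
               "cell_type": "code",
         | 
|  | 
            +
               "execution_count": null,
         | 
|  | 
            +
               "metadata": {},
         | 
|  | 
            +
               "outputs": [],
         | 
|  | 
            +
               "source": [
         | 
|  | 
            +
                "# the processor exposes its two components as attributes\n",
         | 
|  | 
            +
                "print(type(processor.feature_extractor).__name__)  # e.g. WhisperFeatureExtractor\n",
         | 
|  | 
            +
                "print(type(processor.tokenizer).__name__)  # e.g. WhisperTokenizer"
         | 
|  | 
            +
               ]
         | 
|  | 
            +
              },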
         | 
| 467 | 
            +
              {
         | 
| 468 | 
            +
               "cell_type": "markdown",
         | 
| 469 | 
            +
               "id": "381acd09-0b0f-4d04-9eb3-f028ac0e5f2c",
         | 
| 470 | 
            +
               "metadata": {
         | 
| 471 | 
            +
                "id": "381acd09-0b0f-4d04-9eb3-f028ac0e5f2c"
         | 
| 472 | 
            +
               },
         | 
| 473 | 
            +
               "source": [
         | 
| 474 | 
            +
                "### Prepare Data"
         | 
| 475 | 
            +
               ]
         | 
| 476 | 
            +
              },
         | 
| 477 | 
            +
              {
         | 
| 478 | 
            +
               "cell_type": "markdown",
         | 
| 479 | 
            +
               "id": "9649bf01-2e8a-45e5-8fca-441c13637b8f",
         | 
| 480 | 
            +
               "metadata": {
         | 
| 481 | 
            +
                "id": "9649bf01-2e8a-45e5-8fca-441c13637b8f"
         | 
| 482 | 
            +
               },
         | 
| 483 | 
            +
               "source": [
         | 
| 484 | 
            +
                "Let's print the first example of the Common Voice dataset to see \n",
         | 
| 485 | 
            +
                "what form the data is in:"
         | 
| 486 | 
            +
               ]
         | 
| 487 | 
            +
              },
         | 
| 488 | 
            +
              {
         | 
| 489 | 
            +
               "cell_type": "code",
         | 
| 490 | 
            +
               "execution_count": null,
         | 
| 491 | 
            +
               "id": "6e6b0ec5-0c94-4e2c-ae24-c791be1b2255",
         | 
| 492 | 
            +
               "metadata": {
         | 
| 493 | 
            +
                "id": "6e6b0ec5-0c94-4e2c-ae24-c791be1b2255"
         | 
| 494 | 
            +
               },
         | 
| 495 | 
            +
               "outputs": [],
         | 
| 496 | 
            +
               "source": [
         | 
| 497 | 
            +
                "print(common_voice[\"train\"][0])"
         | 
| 498 | 
            +
               ]
         | 
| 499 | 
            +
              },
         | 
| 500 | 
            +
              {
         | 
| 501 | 
            +
               "cell_type": "markdown",
         | 
| 502 | 
            +
               "id": "5a679f05-063d-41b3-9b58-4fc9c6ccf4fd",
         | 
| 503 | 
            +
               "metadata": {
         | 
| 504 | 
            +
                "id": "5a679f05-063d-41b3-9b58-4fc9c6ccf4fd"
         | 
| 505 | 
            +
               },
         | 
| 506 | 
            +
               "source": [
         | 
| 507 | 
            +
                "Since \n",
         | 
| 508 | 
            +
                "our input audio is sampled at 48kHz, we need to _downsample_ it to \n",
         | 
| 509 | 
            +
                "16kHz prior to passing it to the Whisper feature extractor, 16kHz being the sampling rate expected by the Whisper model. \n",
         | 
| 510 | 
            +
                "\n",
         | 
| 511 | 
            +
                "We'll set the audio inputs to the correct sampling rate using dataset's \n",
         | 
| 512 | 
            +
                "[`cast_column`](https://huggingface.co/docs/datasets/package_reference/main_classes.html?highlight=cast_column#datasets.DatasetDict.cast_column)\n",
         | 
| 513 | 
            +
                "method. This operation does not change the audio in-place, \n",
         | 
| 514 | 
            +
                "but rather signals to `datasets` to resample audio samples _on the fly_ the \n",
         | 
| 515 | 
            +
                "first time that they are loaded:"
         | 
| 516 | 
            +
               ]
         | 
| 517 | 
            +
              },
         | 
| 518 | 
            +
              {
         | 
| 519 | 
            +
               "cell_type": "code",
         | 
| 520 | 
            +
               "execution_count": null,
         | 
| 521 | 
            +
               "id": "f12e2e57-156f-417b-8cfb-69221cc198e8",
         | 
| 522 | 
            +
               "metadata": {
         | 
| 523 | 
            +
                "id": "f12e2e57-156f-417b-8cfb-69221cc198e8"
         | 
| 524 | 
            +
               },
         | 
| 525 | 
            +
               "outputs": [],
         | 
| 526 | 
            +
               "source": [
         | 
| 527 | 
            +
                "from datasets import Audio\n",
         | 
| 528 | 
            +
                "\n",
         | 
| 529 | 
            +
                "common_voice = common_voice.cast_column(\"audio\", Audio(sampling_rate=16000))"
         | 
| 530 | 
            +
               ]
         | 
| 531 | 
            +
              },
         | 
| 532 | 
            +
              {
         | 
| 533 | 
            +
               "cell_type": "markdown",
         | 
| 534 | 
            +
               "id": "00382a3e-abec-4cdd-a54c-d1aaa3ea4707",
         | 
| 535 | 
            +
               "metadata": {
         | 
| 536 | 
            +
                "id": "00382a3e-abec-4cdd-a54c-d1aaa3ea4707"
         | 
| 537 | 
            +
               },
         | 
| 538 | 
            +
               "source": [
         | 
| 539 | 
            +
                "Re-loading the first audio sample in the Common Voice dataset will resample \n",
         | 
| 540 | 
            +
                "it to the desired sampling rate:"
         | 
| 541 | 
            +
               ]
         | 
| 542 | 
            +
              },
         | 
| 543 | 
            +
              {
         | 
| 544 | 
            +
               "cell_type": "code",
         | 
| 545 | 
            +
               "execution_count": null,
         | 
| 546 | 
            +
               "id": "87122d71-289a-466a-afcf-fa354b18946b",
         | 
| 547 | 
            +
               "metadata": {
         | 
| 548 | 
            +
                "id": "87122d71-289a-466a-afcf-fa354b18946b"
         | 
| 549 | 
            +
               },
         | 
| 550 | 
            +
               "outputs": [],
         | 
| 551 | 
            +
               "source": [
         | 
| 552 | 
            +
                "print(common_voice[\"train\"][0])"
         | 
| 553 | 
            +
               ]
         | 
| 554 | 
            +
              },
         | 
| 555 | 
            +
              {
         | 
| 556 | 
            +
               "cell_type": "markdown",
         | 
| 557 | 
            +
               "id": "3df7378a-a4c0-45d7-8d07-defbd1062ab6",
         | 
| 558 | 
            +
               "metadata": {},
         | 
| 559 | 
            +
               "source": [
         | 
| 560 | 
            +
                "We'll define our pre-processing strategy. We advise that you **do not** lower-case the transcriptions or remove punctuation unless mixing different datasets. This will enable you to fine-tune Whisper models that can predict punctuation and casing. Later, you will see how we can evaluate the predictions without punctuation or casing, so that the models benefit from the WER improvement obtained by normalising the transcriptions while still predicting fully formatted transcriptions."
         | 
| 561 | 
            +
               ]
         | 
| 562 | 
            +
              },
         | 
| 563 | 
            +
              {
         | 
| 564 | 
            +
               "cell_type": "code",
         | 
| 565 | 
            +
               "execution_count": null,
         | 
| 566 | 
            +
               "id": "d041650e-1c48-4439-87b3-5b6f4a514107",
         | 
| 567 | 
            +
               "metadata": {},
         | 
| 568 | 
            +
               "outputs": [],
         | 
| 569 | 
            +
               "source": [
         | 
| 570 | 
            +
                "from transformers.models.whisper.english_normalizer import BasicTextNormalizer\n",
         | 
| 571 | 
            +
                "\n",
         | 
| 572 | 
            +
                "do_lower_case = False\n",
         | 
| 573 | 
            +
                "do_remove_punctuation = False\n",
         | 
| 574 | 
            +
                "\n",
         | 
| 575 | 
            +
                "normalizer = BasicTextNormalizer()"
         | 
| 576 | 
            +
               ]
         | 
| 577 | 
            +
              },
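         | 
|  | 
            +
              {
         | 
|  | 
            +
               "cell_type": "markdown",
         | 
|  | 
            +
               "metadata": {},
         | 
|  | 
            +
               "source": [
         | 
|  | 
            +
                "To see what the normaliser does, we can apply it to a toy English string (illustrative only, not taken from the dataset): punctuation is stripped and the text is lower-cased:"
         | 
|  | 
            +
               ]
         | 
|  | 
            +
              },
         | 
|  | 
            +
              {
         | 
|  | 
            +
               "cell_type": "code",
         | 
|  | 
            +
               "execution_count": null,
         | 
|  | 
            +
               "metadata": {},
         | 
|  | 
            +
               "outputs": [],
         | 
|  | 
            +
               "source": [
         | 
|  | 
            +
                "# illustrative input: punctuation is removed and letters lower-cased\n",
         | 
|  | 
            +
                "print(normalizer(\"It's a test, isn't it?\").strip())"
         | 
|  | 
            +
               ]
         | 
|  | 
            +
              },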
         | 
| 578 | 
            +
              {
         | 
| 579 | 
            +
               "cell_type": "markdown",
         | 
| 580 | 
            +
               "id": "89e12c2e-2f14-479b-987b-f0c75c881095",
         | 
| 581 | 
            +
               "metadata": {},
         | 
| 582 | 
            +
               "source": [
         | 
| 583 | 
            +
                "Now we can write a function to prepare our data ready for the model:\n",
         | 
| 584 | 
            +
                "1. We load and resample the audio data by calling `batch[\"audio\"]`. As explained above, 🤗 Datasets performs any necessary resampling operations on the fly.\n",
         | 
| 585 | 
            +
                "2. We use the feature extractor to compute the log-Mel spectrogram input features from our 1-dimensional audio array.\n",
         | 
| 586 | 
            +
                "3. We perform any optional pre-processing (lower-case or remove punctuation).\n",
         | 
| 587 | 
            +
                "4. We encode the transcriptions to label ids through the use of the tokenizer."
         | 
| 588 | 
            +
               ]
         | 
| 589 | 
            +
              },
         | 
| 590 | 
            +
              {
         | 
| 591 | 
            +
               "cell_type": "code",
         | 
| 592 | 
            +
               "execution_count": null,
         | 
| 593 | 
            +
               "id": "c085911c-a10a-41ef-8874-306e0503e9bb",
         | 
| 594 | 
            +
               "metadata": {},
         | 
| 595 | 
            +
               "outputs": [],
         | 
| 596 | 
            +
               "source": [
         | 
| 597 | 
            +
                "def prepare_dataset(batch):\n",
         | 
| 598 | 
            +
                "    # load and (possibly) resample audio data to 16kHz\n",
         | 
| 599 | 
            +
                "    audio = batch[\"audio\"]\n",
         | 
| 600 | 
            +
                "\n",
         | 
| 601 | 
            +
                "    # compute log-Mel input features from input audio array \n",
         | 
| 602 | 
            +
                "    batch[\"input_features\"] = processor.feature_extractor(audio[\"array\"], sampling_rate=audio[\"sampling_rate\"]).input_features[0]\n",
         | 
| 603 | 
            +
                "    # compute input length of audio sample in seconds\n",
         | 
| 604 | 
            +
                "    batch[\"input_length\"] = len(audio[\"array\"]) / audio[\"sampling_rate\"]\n",
         | 
| 605 | 
            +
                "    \n",
         | 
| 606 | 
            +
                "    # optional pre-processing steps\n",
         | 
| 607 | 
            +
                "    transcription = batch[\"sentence\"]\n",
         | 
| 608 | 
            +
                "    if do_lower_case:\n",
         | 
| 609 | 
            +
                "        transcription = transcription.lower()\n",
         | 
| 610 | 
            +
                "    if do_remove_punctuation:\n",
         | 
| 611 | 
            +
                "        transcription = normalizer(transcription).strip()\n",
         | 
| 612 | 
            +
                "    \n",
         | 
| 613 | 
            +
                "    # encode target text to label ids\n",
         | 
| 614 | 
            +
                "    batch[\"labels\"] = processor.tokenizer(transcription).input_ids\n",
         | 
| 615 | 
            +
                "    return batch"
         | 
| 616 | 
            +
               ]
         | 
| 617 | 
            +
              },
         | 
| 618 | 
            +
              {
         | 
| 619 | 
            +
               "cell_type": "markdown",
         | 
| 620 | 
            +
               "id": "8c960965-9fb6-466f-9dbd-c9d43e71d9d0",
         | 
| 621 | 
            +
               "metadata": {
         | 
| 622 | 
            +
                "id": "70b319fb-2439-4ef6-a70d-a47bf41c4a13"
         | 
| 623 | 
            +
               },
         | 
| 624 | 
            +
               "source": [
         | 
| 625 | 
            +
                "We can apply the data preparation function to all of our training examples using dataset's `.map` method. The argument `num_proc` specifies how many CPU cores to use. Setting `num_proc` > 1 will enable multiprocessing. If the `.map` method hangs with multiprocessing, set `num_proc=1` and process the dataset sequentially."
         | 
| 626 | 
            +
               ]
         | 
| 627 | 
            +
              },
         | 
| 628 | 
            +
              {
         | 
| 629 | 
            +
               "cell_type": "code",
         | 
| 630 | 
            +
               "execution_count": null,
         | 
| 631 | 
            +
               "id": "7b73ab39-ffaf-4b9e-86e5-782963c6134b",
         | 
| 632 | 
            +
               "metadata": {
         | 
| 633 | 
            +
                "id": "7b73ab39-ffaf-4b9e-86e5-782963c6134b"
         | 
| 634 | 
            +
               },
         | 
| 635 | 
            +
               "outputs": [],
         | 
| 636 | 
            +
               "source": [
         | 
| 637 | 
            +
                "common_voice = common_voice.map(prepare_dataset, remove_columns=common_voice.column_names[\"train\"], num_proc=2)"
         | 
| 638 | 
            +
               ]
         | 
| 639 | 
            +
              },
         | 
| 640 | 
            +
              {
         | 
| 641 | 
            +
               "cell_type": "markdown",
         | 
| 642 | 
            +
               "id": "54ce0fdb-7218-4a4d-b175-383980fec0df",
         | 
| 643 | 
            +
               "metadata": {},
         | 
| 644 | 
            +
               "source": [
         | 
| 645 | 
            +
                "Finally, we filter any training data with audio samples longer than 30s. These samples would otherwise be truncated by the Whisper feature-extractor which could affect the stability of training. We define a function that returns `True` for samples that are less than 30s, and `False` for those that are longer:"
         | 
| 646 | 
            +
               ]
         | 
| 647 | 
            +
              },
         | 
| 648 | 
            +
              {
         | 
| 649 | 
            +
               "cell_type": "code",
         | 
| 650 | 
            +
               "execution_count": null,
         | 
| 651 | 
            +
               "id": "01cb25ef-4bb0-4325-9461-f59198acadf6",
         | 
| 652 | 
            +
               "metadata": {},
         | 
| 653 | 
            +
               "outputs": [],
         | 
| 654 | 
            +
               "source": [
         | 
| 655 | 
            +
                "max_input_length = 30.0\n",
         | 
| 656 | 
            +
                "\n",
         | 
| 657 | 
            +
                "def is_audio_in_length_range(length):\n",
         | 
| 658 | 
            +
                "    return length < max_input_length"
         | 
| 659 | 
            +
               ]
         | 
| 660 | 
            +
              },
         | 
| 661 | 
            +
              {
         | 
| 662 | 
            +
               "cell_type": "markdown",
         | 
| 663 | 
            +
               "id": "30e676a8-7ca8-4850-8c5d-5b2b00d13fba",
         | 
| 664 | 
            +
               "metadata": {},
         | 
| 665 | 
            +
               "source": [
         | 
| 666 | 
            +
                "We apply our filter function to all samples of our training dataset through 🤗 Datasets' `.filter` method:"
         | 
| 667 | 
            +
               ]
         | 
| 668 | 
            +
              },
         | 
| 669 | 
            +
              {
         | 
| 670 | 
            +
               "cell_type": "code",
         | 
| 671 | 
            +
               "execution_count": null,
         | 
| 672 | 
            +
               "id": "333f7f6e-6053-4d3b-8924-c733c79b82ac",
         | 
| 673 | 
            +
               "metadata": {},
         | 
| 674 | 
            +
               "outputs": [],
         | 
| 675 | 
            +
               "source": [
         | 
| 676 | 
            +
                "common_voice[\"train\"] = common_voice[\"train\"].filter(\n",
         | 
| 677 | 
            +
                "    is_audio_in_length_range,\n",
         | 
| 678 | 
            +
                "    input_columns=[\"input_length\"],\n",
         | 
| 679 | 
            +
                ")"
         | 
| 680 | 
            +
               ]
         | 
| 681 | 
            +
              },
         | 
| 682 | 
            +
              {
         | 
| 683 | 
            +
               "cell_type": "markdown",
         | 
| 684 | 
            +
               "id": "263a5a58-0239-4a25-b0df-c625fc9c5810",
         | 
| 685 | 
            +
               "metadata": {
         | 
| 686 | 
            +
                "id": "263a5a58-0239-4a25-b0df-c625fc9c5810"
         | 
| 687 | 
            +
               },
         | 
| 688 | 
            +
               "source": [
         | 
| 689 | 
            +
                "## Training and Evaluation"
         | 
| 690 | 
            +
               ]
         | 
| 691 | 
            +
              },
         | 
| 692 | 
            +
              {
         | 
| 693 | 
            +
               "attachments": {},
         | 
| 694 | 
            +
               "cell_type": "markdown",
         | 
| 695 | 
            +
               "id": "a693e768-c5a6-453f-89a1-b601dcf7daf7",
         | 
| 696 | 
            +
               "metadata": {
         | 
| 697 | 
            +
                "id": "a693e768-c5a6-453f-89a1-b601dcf7daf7"
         | 
| 698 | 
            +
               },
         | 
| 699 | 
            +
               "source": [
         | 
| 700 | 
            +
                "Now that we've prepared our data, we're ready to dive into the training pipeline. \n",
         | 
| 701 | 
            +
                "The [🤗 Trainer](https://huggingface.co/transformers/master/main_classes/trainer.html?highlight=trainer)\n",
         | 
| 702 | 
            +
                "will do much of the heavy lifting for us. All we have to do is:\n",
         | 
| 703 | 
            +
                "\n",
         | 
| 704 | 
            +
                "- Define a data collator: the data collator takes our pre-processed data and prepares PyTorch tensors ready for the model.\n",
         | 
| 705 | 
            +
                "\n",
         | 
| 706 | 
            +
                "- Evaluation metrics: during evaluation, we want to evaluate the model using the [word error rate (WER)](https://huggingface.co/metrics/wer) metric. We need to define a `compute_metrics` function that handles this computation.\n",
         | 
| 707 | 
            +
                "\n",
         | 
| 708 | 
            +
                "- Load a pre-trained checkpoint: we need to load a pre-trained checkpoint and configure it correctly for training.\n",
         | 
| 709 | 
            +
                "\n",
         | 
| 710 | 
            +
                "- Define the training configuration: this will be used by the 🤗 Trainer to define the training schedule.\n",
         | 
| 711 | 
            +
                "\n",
         | 
| 712 | 
            +
                "Once we've fine-tuned the model, we will evaluate it on the test data to verify that we have correctly trained it \n",
         | 
| 713 | 
            +
                "to transcribe speech in Arabic."
         | 
| 714 | 
            +
               ]
         | 
| 715 | 
            +
              },
         | 
| 716 | 
            +
              {
         | 
| 717 | 
            +
               "cell_type": "markdown",
         | 
| 718 | 
            +
               "id": "8d230e6d-624c-400a-bbf5-fa660881df25",
         | 
| 719 | 
            +
               "metadata": {
         | 
| 720 | 
            +
                "id": "8d230e6d-624c-400a-bbf5-fa660881df25"
         | 
| 721 | 
            +
               },
         | 
| 722 | 
            +
               "source": [
         | 
| 723 | 
            +
                "### Define a Data Collator"
         | 
| 724 | 
            +
               ]
         | 
| 725 | 
            +
              },
         | 
| 726 | 
            +
              {
         | 
| 727 | 
            +
               "cell_type": "markdown",
         | 
| 728 | 
            +
               "id": "04def221-0637-4a69-b242-d3f0c1d0ee78",
         | 
| 729 | 
            +
               "metadata": {
         | 
| 730 | 
            +
                "id": "04def221-0637-4a69-b242-d3f0c1d0ee78"
         | 
| 731 | 
            +
               },
         | 
| 732 | 
            +
               "source": [
         | 
| 733 | 
            +
                "The data collator for a sequence-to-sequence speech model is unique in the sense that it \n",
         | 
| 734 | 
            +
                "treats the `input_features` and `labels` independently: the  `input_features` must be \n",
         | 
| 735 | 
            +
                "handled by the feature extractor and the `labels` by the tokenizer.\n",
         | 
| 736 | 
            +
                "\n",
         | 
| 737 | 
            +
                "The `input_features` are already padded to 30s and converted to a log-Mel spectrogram \n",
         | 
| 738 | 
            +
                "of fixed dimension by action of the feature extractor, so all we have to do is convert the `input_features`\n",
         | 
| 739 | 
            +
                "to batched PyTorch tensors. We do this using the feature extractor's `.pad` method with `return_tensors=pt`.\n",
         | 
| 740 | 
            +
                "\n",
         | 
| 741 | 
            +
                "The `labels` on the other hand are un-padded. We first pad the sequences\n",
         | 
| 742 | 
            +
                "to the maximum length in the batch using the tokenizer's `.pad` method. The padding tokens \n",
         | 
| 743 | 
            +
                "are then replaced by `-100` so that these tokens are **not** taken into account when \n",
         | 
| 744 | 
            +
                "computing the loss. We then cut the BOS token from the start of the label sequence as we \n",
         | 
| 745 | 
            +
                "append it later during training.\n",
         | 
| 746 | 
            +
                "\n",
         | 
| 747 | 
            +
                "We can leverage the `WhisperProcessor` we defined earlier to perform both the \n",
         | 
| 748 | 
            +
                "feature extractor and the tokenizer operations:"
         | 
| 749 | 
            +
               ]
         | 
| 750 | 
            +
              },
         | 
| 751 | 
            +
              {
         | 
| 752 | 
            +
               "cell_type": "code",
         | 
| 753 | 
            +
               "execution_count": null,
         | 
| 754 | 
            +
               "id": "8326221e-ec13-4731-bb4e-51e5fc1486c5",
         | 
| 755 | 
            +
               "metadata": {
         | 
| 756 | 
            +
                "id": "8326221e-ec13-4731-bb4e-51e5fc1486c5"
         | 
| 757 | 
            +
               },
         | 
| 758 | 
            +
               "outputs": [],
         | 
| 759 | 
            +
               "source": [
         | 
| 760 | 
            +
                "import torch\n",
         | 
| 761 | 
            +
                "\n",
         | 
| 762 | 
            +
                "from dataclasses import dataclass\n",
         | 
| 763 | 
            +
                "from typing import Any, Dict, List, Union\n",
         | 
| 764 | 
            +
                "\n",
         | 
| 765 | 
            +
                "@dataclass\n",
         | 
| 766 | 
            +
                "class DataCollatorSpeechSeq2SeqWithPadding:\n",
         | 
| 767 | 
            +
                "    processor: Any\n",
         | 
| 768 | 
            +
                "\n",
         | 
| 769 | 
            +
                "    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:\n",
         | 
| 770 | 
            +
                "        # split inputs and labels since they have to be of different lengths and need different padding methods\n",
         | 
| 771 | 
            +
                "        # first treat the audio inputs by simply returning torch tensors\n",
         | 
| 772 | 
            +
                "        input_features = [{\"input_features\": feature[\"input_features\"]} for feature in features]\n",
         | 
| 773 | 
            +
                "        batch = self.processor.feature_extractor.pad(input_features, return_tensors=\"pt\")\n",
         | 
| 774 | 
            +
                "\n",
         | 
| 775 | 
            +
                "        # get the tokenized label sequences\n",
         | 
| 776 | 
            +
                "        label_features = [{\"input_ids\": feature[\"labels\"]} for feature in features]\n",
         | 
| 777 | 
            +
                "        # pad the labels to max length\n",
         | 
| 778 | 
            +
                "        labels_batch = self.processor.tokenizer.pad(label_features, return_tensors=\"pt\")\n",
         | 
| 779 | 
            +
                "\n",
         | 
| 780 | 
            +
                "        # replace padding with -100 to ignore loss correctly\n",
         | 
| 781 | 
            +
                "        labels = labels_batch[\"input_ids\"].masked_fill(labels_batch.attention_mask.ne(1), -100)\n",
         | 
| 782 | 
            +
                "\n",
         | 
| 783 | 
            +
                "        # if bos token is appended in previous tokenization step,\n",
         | 
| 784 | 
            +
                "        # cut bos token here as it's append later anyways\n",
         | 
| 785 | 
            +
                "        if (labels[:, 0] == self.processor.tokenizer.bos_token_id).all().cpu().item():\n",
         | 
| 786 | 
            +
                "            labels = labels[:, 1:]\n",
         | 
| 787 | 
            +
                "\n",
         | 
| 788 | 
            +
                "        batch[\"labels\"] = labels\n",
         | 
| 789 | 
            +
                "\n",
         | 
| 790 | 
            +
                "        return batch"
         | 
| 791 | 
            +
               ]
         | 
| 792 | 
            +
              },
         | 
| 793 | 
            +
              {
         | 
| 794 | 
            +
               "cell_type": "markdown",
         | 
| 795 | 
            +
               "id": "3cae7dbf-8a50-456e-a3a8-7fd005390f86",
         | 
| 796 | 
            +
               "metadata": {
         | 
| 797 | 
            +
                "id": "3cae7dbf-8a50-456e-a3a8-7fd005390f86"
         | 
| 798 | 
            +
               },
         | 
| 799 | 
            +
               "source": [
         | 
| 800 | 
            +
                "Let's initialise the data collator we've just defined:"
         | 
| 801 | 
            +
               ]
         | 
| 802 | 
            +
              },
         | 
| 803 | 
            +
              {
         | 
| 804 | 
            +
               "cell_type": "code",
         | 
| 805 | 
            +
               "execution_count": null,
         | 
| 806 | 
            +
               "id": "fc834702-c0d3-4a96-b101-7b87be32bf42",
         | 
| 807 | 
            +
               "metadata": {
         | 
| 808 | 
            +
                "id": "fc834702-c0d3-4a96-b101-7b87be32bf42"
         | 
| 809 | 
            +
               },
         | 
| 810 | 
            +
               "outputs": [],
         | 
| 811 | 
            +
               "source": [
         | 
| 812 | 
            +
                "data_collator = DataCollatorSpeechSeq2SeqWithPadding(processor=processor)"
         | 
| 813 | 
            +
               ]
         | 
| 814 | 
            +
              },
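         | 
|  | 
            +
              {
         | 
|  | 
            +
               "cell_type": "markdown",
         | 
|  | 
            +
               "metadata": {},
         | 
|  | 
            +
               "source": [
         | 
|  | 
            +
                "We can sanity-check the collator on a couple of prepared examples (a minimal sketch, assuming `common_voice` has already been processed with `prepare_dataset` above). The `input_features` come back as a single fixed-size tensor, while the padded positions of the `labels` are set to `-100`:"
         | 
|  | 
            +
               ]
         | 
|  | 
            +
              },
         | 
|  | 
            +
              {
         | 
|  | 
            +
               "cell_type": "code",
         | 
|  | 
            +
               "execution_count": null,
         | 
|  | 
            +
               "metadata": {},
         | 
|  | 
            +
               "outputs": [],
         | 
|  | 
            +
               "source": [
         | 
|  | 
            +
                "sample_batch = data_collator([common_voice[\"train\"][i] for i in range(2)])\n",
         | 
|  | 
            +
                "\n",
         | 
|  | 
            +
                "print(sample_batch[\"input_features\"].shape)  # e.g. torch.Size([2, 80, 3000])\n",
         | 
|  | 
            +
                "print(sample_batch[\"labels\"][0])  # label ids, with padding replaced by -100"
         | 
|  | 
            +
               ]
         | 
|  | 
            +
              },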
         | 
| 815 | 
            +
              {
         | 
| 816 | 
            +
               "cell_type": "markdown",
         | 
| 817 | 
            +
               "id": "d62bb2ab-750a-45e7-82e9-61d6f4805698",
         | 
| 818 | 
            +
               "metadata": {
         | 
| 819 | 
            +
                "id": "d62bb2ab-750a-45e7-82e9-61d6f4805698"
         | 
| 820 | 
            +
               },
         | 
| 821 | 
            +
               "source": [
         | 
| 822 | 
            +
                "### Evaluation Metrics"
         | 
| 823 | 
            +
               ]
         | 
| 824 | 
            +
              },
         | 
| 825 | 
            +
              {
         | 
| 826 | 
            +
               "cell_type": "markdown",
         | 
| 827 | 
            +
               "id": "66fee1a7-a44c-461e-b047-c3917221572e",
         | 
| 828 | 
            +
               "metadata": {
         | 
| 829 | 
            +
                "id": "66fee1a7-a44c-461e-b047-c3917221572e"
         | 
| 830 | 
            +
               },
         | 
| 831 | 
            +
               "source": [
         | 
| 832 | 
            +
                "We'll use the word error rate (WER) metric, the 'de-facto' metric for assessing \n",
         | 
| 833 | 
            +
                "ASR systems. For more information, refer to the WER [docs](https://huggingface.co/metrics/wer). We'll load the WER metric from 🤗 Evaluate:"
         | 
| 834 | 
            +
               ]
         | 
| 835 | 
            +
              },
         | 
| 836 | 
            +
              {
         | 
| 837 | 
            +
               "cell_type": "code",
         | 
| 838 | 
            +
               "execution_count": null,
         | 
| 839 | 
            +
               "id": "b22b4011-f31f-4b57-b684-c52332f92890",
         | 
| 840 | 
            +
               "metadata": {
         | 
| 841 | 
            +
                "id": "b22b4011-f31f-4b57-b684-c52332f92890"
         | 
| 842 | 
            +
               },
         | 
| 843 | 
            +
               "outputs": [],
         | 
| 844 | 
            +
               "source": [
         | 
| 845 | 
            +
                "import evaluate\n",
         | 
| 846 | 
            +
                "\n",
         | 
| 847 | 
            +
                "metric = evaluate.load(\"wer\")"
         | 
| 848 | 
            +
               ]
         | 
| 849 | 
            +
              },
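         | 
|  | 
            +
              {
         | 
|  | 
            +
               "cell_type": "markdown",
         | 
|  | 
            +
               "metadata": {},
         | 
|  | 
            +
               "source": [
         | 
|  | 
            +
                "As a quick illustration with toy strings (not model outputs): one substituted word against a two-word reference gives a WER of 0.5:"
         | 
|  | 
            +
               ]
         | 
|  | 
            +
              },
         | 
|  | 
            +
              {
         | 
|  | 
            +
               "cell_type": "code",
         | 
|  | 
            +
               "execution_count": null,
         | 
|  | 
            +
               "metadata": {},
         | 
|  | 
            +
               "outputs": [],
         | 
|  | 
            +
               "source": [
         | 
|  | 
            +
                "# 1 substitution / 2 reference words = 0.5\n",
         | 
|  | 
            +
                "print(metric.compute(predictions=[\"hello world\"], references=[\"hello word\"]))"
         | 
|  | 
            +
               ]
         | 
|  | 
            +
              },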
         | 
| 850 | 
            +
              {
         | 
| 851 | 
            +
               "cell_type": "markdown",
         | 
| 852 | 
            +
               "id": "4f32cab6-31f0-4cb9-af4c-40ba0f5fc508",
         | 
| 853 | 
            +
               "metadata": {
         | 
| 854 | 
            +
                "id": "4f32cab6-31f0-4cb9-af4c-40ba0f5fc508"
         | 
| 855 | 
            +
               },
         | 
| 856 | 
            +
               "source": [
         | 
| 857 | 
            +
                "We then simply have to define a function that takes our model \n",
         | 
| 858 | 
            +
                "predictions and returns the WER metric. This function, called\n",
         | 
| 859 | 
            +
                "`compute_metrics`, first replaces `-100` with the `pad_token_id`\n",
         | 
| 860 | 
            +
                "in the `label_ids` (undoing the step we applied in the \n",
         | 
| 861 | 
            +
                "data collator to ignore padded tokens correctly in the loss).\n",
         | 
| 862 | 
            +
                "It then decodes the predicted and label ids to strings. Finally,\n",
         | 
| 863 | 
            +
                "it computes the WER between the predictions and reference labels. \n",
         | 
| 864 | 
            +
                "Here, we have the option of evaluating with the 'normalised' transcriptions \n",
         | 
| 865 | 
            +
                "and predictions. We recommend you set this to `True` to benefit from the WER \n",
         | 
| 866 | 
            +
                "improvement obtained by normalising the transcriptions."
         | 
| 867 | 
            +
               ]
         | 
| 868 | 
            +
              },
         | 
| 869 | 
            +
              {
         | 
| 870 | 
            +
               "cell_type": "code",
         | 
| 871 | 
            +
               "execution_count": null,
         | 
| 872 | 
            +
               "id": "23959a70-22d0-4ffe-9fa1-72b61e75bb52",
         | 
| 873 | 
            +
               "metadata": {
         | 
| 874 | 
            +
                "id": "23959a70-22d0-4ffe-9fa1-72b61e75bb52"
         | 
| 875 | 
            +
               },
         | 
| 876 | 
            +
               "outputs": [],
         | 
| 877 | 
            +
               "source": [
         | 
| 878 | 
            +
                "# evaluate with the 'normalised' WER\n",
         | 
| 879 | 
            +
                "do_normalize_eval = True\n",
         | 
| 880 | 
            +
                "\n",
         | 
| 881 | 
            +
                "def compute_metrics(pred):\n",
         | 
| 882 | 
            +
                "    pred_ids = pred.predictions\n",
         | 
| 883 | 
            +
                "    label_ids = pred.label_ids\n",
         | 
| 884 | 
            +
                "\n",
         | 
| 885 | 
            +
                "    # replace -100 with the pad_token_id\n",
         | 
| 886 | 
            +
                "    label_ids[label_ids == -100] = processor.tokenizer.pad_token_id\n",
         | 
| 887 | 
            +
                "\n",
         | 
| 888 | 
            +
                "    # we do not want to group tokens when computing the metrics\n",
         | 
| 889 | 
            +
                "    pred_str = processor.tokenizer.batch_decode(pred_ids, skip_special_tokens=True)\n",
         | 
| 890 | 
            +
                "    label_str = processor.tokenizer.batch_decode(label_ids, skip_special_tokens=True)\n",
         | 
| 891 | 
            +
                "\n",
         | 
| 892 | 
            +
                "    if do_normalize_eval:\n",
         | 
| 893 | 
            +
                "        pred_str = [normalizer(pred) for pred in pred_str]\n",
         | 
| 894 | 
            +
                "        label_str = [normalizer(label) for label in label_str]\n",
         | 
| 895 | 
            +
                "\n",
         | 
| 896 | 
            +
                "    wer = 100 * metric.compute(predictions=pred_str, references=label_str)\n",
         | 
| 897 | 
            +
                "\n",
         | 
| 898 | 
            +
                "    return {\"wer\": wer}"
         | 
| 899 | 
            +
               ]
         | 
| 900 | 
            +
              },
         | 
| 901 | 
            +
              {
         | 
| 902 | 
            +
               "cell_type": "markdown",
         | 
| 903 | 
            +
               "id": "daf2a825-6d9f-4a23-b145-c37c0039075b",
         | 
| 904 | 
            +
               "metadata": {
         | 
| 905 | 
            +
                "id": "daf2a825-6d9f-4a23-b145-c37c0039075b"
         | 
| 906 | 
            +
               },
         | 
| 907 | 
            +
               "source": [
         | 
| 908 | 
            +
                "### Load a Pre-Trained Checkpoint"
         | 
| 909 | 
            +
               ]
         | 
| 910 | 
            +
              },
         | 
| 911 | 
            +
              {
         | 
| 912 | 
            +
               "cell_type": "markdown",
         | 
| 913 | 
            +
               "id": "437a97fa-4864-476b-8abc-f28b8166cfa5",
         | 
| 914 | 
            +
               "metadata": {
         | 
| 915 | 
            +
                "id": "437a97fa-4864-476b-8abc-f28b8166cfa5"
         | 
| 916 | 
            +
               },
         | 
| 917 | 
            +
               "source": [
         | 
| 918 | 
            +
                "Now let's load the pre-trained Whisper `small` checkpoint. Again, this \n",
         | 
| 919 | 
            +
                "is trivial through use of 🤗 Transformers!"
         | 
| 920 | 
            +
               ]
         | 
| 921 | 
            +
              },
         | 
| 922 | 
            +
              {
         | 
| 923 | 
            +
               "cell_type": "code",
         | 
| 924 | 
            +
               "execution_count": null,
         | 
| 925 | 
            +
               "id": "5a10cc4b-07ec-4ebd-ac1d-7c601023594f",
         | 
| 926 | 
            +
               "metadata": {
         | 
| 927 | 
            +
                "id": "5a10cc4b-07ec-4ebd-ac1d-7c601023594f"
         | 
| 928 | 
            +
               },
         | 
| 929 | 
            +
               "outputs": [],
         | 
| 930 | 
            +
               "source": [
         | 
| 931 | 
            +
                "from transformers import WhisperForConditionalGeneration\n",
         | 
| 932 | 
            +
                "\n",
         | 
| 933 | 
            +
                "model = WhisperForConditionalGeneration.from_pretrained(model_name)"
         | 
| 934 | 
            +
               ]
         | 
| 935 | 
            +
              },
         | 
| 936 | 
            +
              {
         | 
| 937 | 
            +
               "cell_type": "markdown",
         | 
| 938 | 
            +
               "id": "a15ead5f-2277-4a39-937b-585c2497b2df",
         | 
| 939 | 
            +
               "metadata": {
         | 
| 940 | 
            +
                "id": "a15ead5f-2277-4a39-937b-585c2497b2df"
         | 
| 941 | 
            +
               },
         | 
| 942 | 
            +
               "source": [
         | 
| 943 | 
            +
                "Override generation arguments - no tokens are forced as decoder outputs (see [`forced_decoder_ids`](https://huggingface.co/docs/transformers/main_classes/text_generation#transformers.generation_utils.GenerationMixin.generate.forced_decoder_ids)), no tokens are suppressed during generation (see [`suppress_tokens`](https://huggingface.co/docs/transformers/main_classes/text_generation#transformers.generation_utils.GenerationMixin.generate.suppress_tokens)). Set `use_cache` to False since we're using gradient checkpointing, and the two are incompatible:"
         | 
| 944 | 
            +
               ]
         | 
| 945 | 
            +
              },
         | 
| 946 | 
            +
              {
         | 
| 947 | 
            +
               "cell_type": "code",
         | 
| 948 | 
            +
               "execution_count": null,
         | 
| 949 | 
            +
               "id": "62038ba3-88ed-4fce-84db-338f50dcd04f",
         | 
| 950 | 
            +
               "metadata": {
         | 
| 951 | 
            +
                "id": "62038ba3-88ed-4fce-84db-338f50dcd04f"
         | 
| 952 | 
            +
               },
         | 
| 953 | 
            +
               "outputs": [],
         | 
| 954 | 
            +
               "source": [
         | 
| 955 | 
            +
                "model.config.forced_decoder_ids = None\n",
         | 
| 956 | 
            +
                "model.config.suppress_tokens = []\n",
         | 
| 957 | 
            +
                "model.config.use_cache = False"
         | 
| 958 | 
            +
               ]
         | 
| 959 | 
            +
              },
         | 
| 960 | 
            +
              {
         | 
| 961 | 
            +
               "cell_type": "markdown",
         | 
| 962 | 
            +
               "id": "2178dea4-80ca-47b6-b6ea-ba1915c90c06",
         | 
| 963 | 
            +
               "metadata": {
         | 
| 964 | 
            +
                "id": "2178dea4-80ca-47b6-b6ea-ba1915c90c06"
         | 
| 965 | 
            +
               },
         | 
| 966 | 
            +
               "source": [
         | 
| 967 | 
            +
                "### Define the Training Configuration"
         | 
| 968 | 
            +
               ]
         | 
| 969 | 
            +
              },
         | 
| 970 | 
            +
              {
         | 
| 971 | 
            +
               "cell_type": "markdown",
         | 
| 972 | 
            +
               "id": "c21af1e9-0188-4134-ac82-defc7bdcc436",
         | 
| 973 | 
            +
               "metadata": {
         | 
| 974 | 
            +
                "id": "c21af1e9-0188-4134-ac82-defc7bdcc436"
         | 
| 975 | 
            +
               },
         | 
| 976 | 
            +
               "source": [
         | 
| 977 | 
            +
                "In the final step, we define all the parameters related to training. For more detail on the training arguments, refer to the Seq2SeqTrainingArguments [docs](https://huggingface.co/docs/transformers/main_classes/trainer#transformers.Seq2SeqTrainingArguments)."
         | 
| 978 | 
            +
               ]
         | 
| 979 | 
            +
              },
         | 
| 980 | 
            +
              {
         | 
| 981 | 
            +
               "cell_type": "code",
         | 
| 982 | 
            +
               "execution_count": null,
         | 
| 983 | 
            +
               "id": "0ae3e9af-97b7-4aa0-ae85-20b23b5bcb3a",
         | 
| 984 | 
            +
               "metadata": {
         | 
| 985 | 
            +
                "id": "0ae3e9af-97b7-4aa0-ae85-20b23b5bcb3a"
         | 
| 986 | 
            +
               },
         | 
| 987 | 
            +
               "outputs": [],
         | 
| 988 | 
            +
               "source": [
         | 
| 989 | 
            +
                "from transformers import Seq2SeqTrainingArguments\n",
         | 
| 990 | 
            +
                "\n",
         | 
| 991 | 
            +
                "training_args = Seq2SeqTrainingArguments(\n",
         | 
| 992 | 
            +
                "    output_dir=\"./\",\n",
         | 
| 993 | 
            +
                "    per_device_train_batch_size=64,\n",
         | 
| 994 | 
            +
                "    gradient_accumulation_steps=1,  # increase by 2x for every 2x decrease in batch size\n",
         | 
| 995 | 
            +
                "    learning_rate=1e-5,\n",
         | 
| 996 | 
            +
                "    warmup_steps=500,\n",
         | 
| 997 | 
            +
                "    max_steps=5000,\n",
         | 
| 998 | 
            +
                "    gradient_checkpointing=True,\n",
         | 
| 999 | 
            +
                "    fp16=True,\n",
         | 
| 1000 | 
            +
                "    evaluation_strategy=\"steps\",\n",
         | 
| 1001 | 
            +
                "    per_device_eval_batch_size=8,\n",
         | 
| 1002 | 
            +
                "    predict_with_generate=True,\n",
         | 
| 1003 | 
            +
                "    generation_max_length=225,\n",
         | 
| 1004 | 
            +
                "    save_steps=1000,\n",
         | 
| 1005 | 
            +
                "    eval_steps=1000,\n",
         | 
| 1006 | 
            +
                "    logging_steps=25,\n",
         | 
| 1007 | 
            +
                "    report_to=[\"tensorboard\"],\n",
         | 
| 1008 | 
            +
                "    load_best_model_at_end=True,\n",
         | 
| 1009 | 
            +
                "    metric_for_best_model=\"wer\",\n",
         | 
| 1010 | 
            +
                "    greater_is_better=False,\n",
         | 
| 1011 | 
            +
                "    push_to_hub=True,\n",
         | 
| 1012 | 
            +
                ")"
         | 
| 1013 | 
            +
               ]
         | 
| 1014 | 
            +
              },
         | 
| 1015 | 
            +
              {
         | 
| 1016 | 
            +
               "cell_type": "markdown",
         | 
| 1017 | 
            +
               "id": "b3a944d8-3112-4552-82a0-be25988b3857",
         | 
| 1018 | 
            +
               "metadata": {
         | 
| 1019 | 
            +
                "id": "b3a944d8-3112-4552-82a0-be25988b3857"
         | 
| 1020 | 
            +
               },
         | 
| 1021 | 
            +
               "source": [
         | 
| 1022 | 
            +
                "**Note**: if one does not want to upload the model checkpoints to the Hub, \n",
         | 
| 1023 | 
            +
                "set `push_to_hub=False`."
         | 
| 1024 | 
            +
               ]
         | 
| 1025 | 
            +
              },
         | 
| 1026 | 
            +
              {
         | 
| 1027 | 
            +
               "cell_type": "markdown",
         | 
| 1028 | 
            +
               "id": "bac29114-d226-4f54-97cf-8718c9f94e1e",
         | 
| 1029 | 
            +
               "metadata": {
         | 
| 1030 | 
            +
                "id": "bac29114-d226-4f54-97cf-8718c9f94e1e"
         | 
| 1031 | 
            +
               },
         | 
| 1032 | 
            +
               "source": [
         | 
| 1033 | 
            +
                "We can forward the training arguments to the 🤗 Trainer along with our model,\n",
         | 
| 1034 | 
            +
                "dataset, data collator and `compute_metrics` function:"
         | 
| 1035 | 
            +
               ]
         | 
| 1036 | 
            +
              },
         | 
| 1037 | 
            +
              {
         | 
| 1038 | 
            +
               "cell_type": "code",
         | 
| 1039 | 
            +
               "execution_count": null,
         | 
| 1040 | 
            +
               "id": "d546d7fe-0543-479a-b708-2ebabec19493",
         | 
| 1041 | 
            +
               "metadata": {
         | 
| 1042 | 
            +
                "id": "d546d7fe-0543-479a-b708-2ebabec19493"
         | 
| 1043 | 
            +
               },
         | 
| 1044 | 
            +
               "outputs": [],
         | 
| 1045 | 
            +
               "source": [
         | 
| 1046 | 
            +
                "from transformers import Seq2SeqTrainer\n",
         | 
| 1047 | 
            +
                "\n",
         | 
| 1048 | 
            +
                "trainer = Seq2SeqTrainer(\n",
         | 
| 1049 | 
            +
                "    args=training_args,\n",
         | 
| 1050 | 
            +
                "    model=model,\n",
         | 
| 1051 | 
            +
                "    train_dataset=common_voice[\"train\"],\n",
         | 
| 1052 | 
            +
                "    eval_dataset=common_voice[\"test\"],\n",
         | 
| 1053 | 
            +
                "    data_collator=data_collator,\n",
         | 
| 1054 | 
            +
                "    compute_metrics=compute_metrics,\n",
         | 
| 1055 | 
            +
                "    tokenizer=processor.feature_extractor,\n",
         | 
| 1056 | 
            +
                ")"
         | 
| 1057 | 
            +
               ]
         | 
| 1058 | 
            +
              },
         | 
| 1059 | 
            +
              {
         | 
| 1060 | 
            +
               "cell_type": "markdown",
         | 
| 1061 | 
            +
               "id": "uOrRhDGtN5S4",
         | 
| 1062 | 
            +
               "metadata": {
         | 
| 1063 | 
            +
                "id": "uOrRhDGtN5S4"
         | 
| 1064 | 
            +
               },
         | 
| 1065 | 
            +
               "source": [
         | 
| 1066 | 
            +
                "We'll save the processor object once before starting training. Since the processor is not trainable, it won't change over the course of training:"
         | 
| 1067 | 
            +
               ]
         | 
| 1068 | 
            +
              },
         | 
| 1069 | 
            +
              {
         | 
| 1070 | 
            +
               "cell_type": "code",
         | 
| 1071 | 
            +
               "execution_count": null,
         | 
| 1072 | 
            +
               "id": "-2zQwMfEOBJq",
         | 
| 1073 | 
            +
               "metadata": {
         | 
| 1074 | 
            +
                "id": "-2zQwMfEOBJq"
         | 
| 1075 | 
            +
               },
         | 
| 1076 | 
            +
               "outputs": [],
         | 
| 1077 | 
            +
               "source": [
         | 
| 1078 | 
            +
                "processor.save_pretrained(training_args.output_dir)"
         | 
| 1079 | 
            +
               ]
         | 
| 1080 | 
            +
              },
         | 
| 1081 | 
            +
              {
         | 
| 1082 | 
            +
               "cell_type": "markdown",
         | 
| 1083 | 
            +
               "id": "7f404cf9-4345-468c-8196-4bd101d9bd51",
         | 
| 1084 | 
            +
               "metadata": {
         | 
| 1085 | 
            +
                "id": "7f404cf9-4345-468c-8196-4bd101d9bd51"
         | 
| 1086 | 
            +
               },
         | 
| 1087 | 
            +
               "source": [
         | 
| 1088 | 
            +
                "### Training"
         | 
| 1089 | 
            +
               ]
         | 
| 1090 | 
            +
              },
         | 
| 1091 | 
            +
              {
         | 
| 1092 | 
            +
               "cell_type": "markdown",
         | 
| 1093 | 
            +
               "id": "5e8b8d56-5a70-4f68-bd2e-f0752d0bd112",
         | 
| 1094 | 
            +
               "metadata": {
         | 
| 1095 | 
            +
                "id": "5e8b8d56-5a70-4f68-bd2e-f0752d0bd112"
         | 
| 1096 | 
            +
               },
         | 
| 1097 | 
            +
               "source": [
         | 
| 1098 | 
            +
                "Training will take approximately 5-10 hours depending on your GPU. The peak GPU memory for the given training configuration is approximately 36GB. \n",
         | 
| 1099 | 
            +
                "Depending on your GPU, it is possible that you will encounter a CUDA `\"out-of-memory\"` error when you launch training. \n",
         | 
| 1100 | 
            +
                "In this case, you can reduce the `per_device_train_batch_size` incrementally by factors of 2 \n",
         | 
| 1101 | 
            +
                "and employ [`gradient_accumulation_steps`](https://huggingface.co/docs/transformers/main_classes/trainer#transformers.Seq2SeqTrainingArguments.gradient_accumulation_steps)\n",
         | 
| 1102 | 
            +
                "to compensate.\n",
         | 
| 1103 | 
            +
                "\n",
         | 
| 1104 | 
            +
                "To launch training, simply execute:"
         | 
| 1105 | 
            +
               ]
         | 
| 1106 | 
            +
              },
         | 
| 1107 | 
            +
              {
         | 
| 1108 | 
            +
               "cell_type": "code",
         | 
| 1109 | 
            +
               "execution_count": null,
         | 
| 1110 | 
            +
               "id": "ee8b7b8e-1c9a-4d77-9137-1778a629e6de",
         | 
| 1111 | 
            +
               "metadata": {
         | 
| 1112 | 
            +
                "id": "ee8b7b8e-1c9a-4d77-9137-1778a629e6de"
         | 
| 1113 | 
            +
               },
         | 
| 1114 | 
            +
               "outputs": [],
         | 
| 1115 | 
            +
               "source": [
         | 
| 1116 | 
            +
                "trainer.train()"
         | 
| 1117 | 
            +
               ]
         | 
| 1118 | 
            +
              },
         | 
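If training is interrupted, for example by a dropped Colab runtime or an out-of-memory error that forces a restart with a smaller batch size, the 🤗 Trainer can pick up from the most recent checkpoint in `output_dir`. A minimal sketch, assuming at least one `checkpoint-*` folder has already been written (one is saved every `save_steps`):

```python
# Resume from the latest checkpoint-* folder in output_dir.
# Assumes at least one checkpoint has already been saved.
trainer.train(resume_from_checkpoint=True)
```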
| 1119 | 
            +
              {
         | 
| 1120 | 
            +
               "cell_type": "markdown",
         | 
| 1121 | 
            +
               "id": "810ced54-7187-4a06-b2fe-ba6dcca94dc3",
         | 
| 1122 | 
            +
               "metadata": {
         | 
| 1123 | 
            +
                "id": "810ced54-7187-4a06-b2fe-ba6dcca94dc3"
         | 
| 1124 | 
            +
               },
         | 
| 1125 | 
            +
               "source": [
         | 
| 1126 | 
            +
                "We can label our checkpoint with the `whisper-event` tag on push by setting the appropriate key-word arguments (kwargs):"
         | 
| 1127 | 
            +
               ]
         | 
| 1128 | 
            +
              },
         | 
| 1129 | 
            +
              {
         | 
| 1130 | 
            +
               "cell_type": "code",
         | 
| 1131 | 
            +
               "execution_count": null,
         | 
| 1132 | 
            +
               "id": "c704f91e-241b-48c9-b8e0-f0da396a9663",
         | 
| 1133 | 
            +
               "metadata": {
         | 
| 1134 | 
            +
                "id": "c704f91e-241b-48c9-b8e0-f0da396a9663"
         | 
| 1135 | 
            +
               },
         | 
| 1136 | 
            +
               "outputs": [],
         | 
| 1137 | 
            +
               "source": [
         | 
| 1138 | 
            +
                "kwargs = {\n",
         | 
| 1139 | 
            +
                "    \"dataset_tags\": \"mozilla-foundation/common_voice_11_0\",\n",
         | 
| 1140 | 
            +
                "    \"dataset\": \"Common Voice 11.0\",  # a 'pretty' name for the training dataset\n",
         | 
| 1141 | 
            +
                "    \"language\": \"ar\",\n",
         | 
| 1142 | 
            +
                "    \"model_name\": \"Whisper Small ar - Zaid Alyafeai\",  # a 'pretty' name for your model\n",
         | 
| 1143 | 
            +
                "    \"finetuned_from\": model_name,\n",
         | 
| 1144 | 
            +
                "    \"tasks\": \"automatic-speech-recognition\",\n",
         | 
| 1145 | 
            +
                "    \"tags\": \"whisper-event\",\n",
         | 
| 1146 | 
            +
                "}"
         | 
| 1147 | 
            +
               ]
         | 
| 1148 | 
            +
              },
         | 
| 1149 | 
            +
              {
         | 
| 1150 | 
            +
               "cell_type": "markdown",
         | 
| 1151 | 
            +
               "id": "090d676a-f944-4297-a938-a40eda0b2b68",
         | 
| 1152 | 
            +
               "metadata": {
         | 
| 1153 | 
            +
                "id": "090d676a-f944-4297-a938-a40eda0b2b68"
         | 
| 1154 | 
            +
               },
         | 
| 1155 | 
            +
               "source": [
         | 
| 1156 | 
            +
                "The training results can now be uploaded to the Hub. To do so, execute the `push_to_hub` command and save the preprocessor object we created:"
         | 
| 1157 | 
            +
               ]
         | 
| 1158 | 
            +
              },
         | 
| 1159 | 
            +
              {
         | 
| 1160 | 
            +
               "cell_type": "code",
         | 
| 1161 | 
            +
               "execution_count": null,
         | 
| 1162 | 
            +
               "id": "d7030622-caf7-4039-939b-6195cdaa2585",
         | 
| 1163 | 
            +
               "metadata": {
         | 
| 1164 | 
            +
                "id": "d7030622-caf7-4039-939b-6195cdaa2585"
         | 
| 1165 | 
            +
               },
         | 
| 1166 | 
            +
               "outputs": [],
         | 
| 1167 | 
            +
               "source": [
         | 
| 1168 | 
            +
                "trainer.push_to_hub(**kwargs)"
         | 
| 1169 | 
            +
               ]
         | 
| 1170 | 
            +
              },
         | 
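Once the push completes, the fine-tuned checkpoint can be loaded straight from the Hub for inference. A hedged sketch, where `your-username/your-repo` is a placeholder for the repository id reported by `push_to_hub`:

```python
from transformers import pipeline

# Hypothetical repo id; substitute the one printed by push_to_hub.
asr = pipeline("automatic-speech-recognition", model="your-username/your-repo")

# Transcribe a local audio file (the path is illustrative).
print(asr("path/to/audio.mp3")["text"])
```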
| 1171 | 
            +
              {
         | 
| 1172 | 
            +
               "cell_type": "markdown",
         | 
| 1173 | 
            +
               "id": "ca743fbd-602c-48d4-ba8d-a2fe60af64ba",
         | 
| 1174 | 
            +
               "metadata": {
         | 
| 1175 | 
            +
                "id": "ca743fbd-602c-48d4-ba8d-a2fe60af64ba"
         | 
| 1176 | 
            +
               },
         | 
| 1177 | 
            +
               "source": [
         | 
| 1178 | 
            +
                "## Closing Remarks"
         | 
| 1179 | 
            +
               ]
         | 
| 1180 | 
            +
              },
         | 
| 1181 | 
            +
              {
         | 
| 1182 | 
            +
               "cell_type": "markdown",
         | 
| 1183 | 
            +
               "id": "7f737783-2870-4e35-aa11-86a42d7d997a",
         | 
| 1184 | 
            +
               "metadata": {
         | 
| 1185 | 
            +
                "id": "7f737783-2870-4e35-aa11-86a42d7d997a"
         | 
| 1186 | 
            +
               },
         | 
| 1187 | 
            +
               "source": [
         | 
| 1188 | 
            +
                "In this blog, we covered a step-by-step guide on fine-tuning Whisper for multilingual ASR \n",
         | 
| 1189 | 
            +
                "using 🤗 Datasets, Transformers and the Hugging Face Hub. For more details on the Whisper model, the Common Voice dataset and the theory behind fine-tuning, refere to the accompanying [blog post](https://huggingface.co/blog/fine-tune-whisper). If you're interested in fine-tuning other \n",
         | 
| 1190 | 
            +
                "Transformers models, both for English and multilingual ASR, be sure to check out the \n",
         | 
| 1191 | 
            +
                "examples scripts at [examples/pytorch/speech-recognition](https://github.com/huggingface/transformers/tree/main/examples/pytorch/speech-recognition)."
         | 
| 1192 | 
            +
               ]
         | 
| 1193 | 
            +
              }
         | 
| 1194 | 
            +
             ],
         | 
| 1195 | 
            +
             "metadata": {
         | 
| 1196 | 
            +
              "colab": {
         | 
| 1197 | 
            +
               "include_colab_link": true,
         | 
| 1198 | 
            +
               "provenance": []
         | 
| 1199 | 
            +
              },
         | 
| 1200 | 
            +
              "kernelspec": {
         | 
| 1201 | 
            +
               "display_name": "Python 3",
         | 
| 1202 | 
            +
               "language": "python",
         | 
| 1203 | 
            +
               "name": "python3"
         | 
| 1204 | 
            +
              },
         | 
| 1205 | 
            +
              "language_info": {
         | 
| 1206 | 
            +
               "codemirror_mode": {
         | 
| 1207 | 
            +
                "name": "ipython",
         | 
| 1208 | 
            +
                "version": 3
         | 
| 1209 | 
            +
               },
         | 
| 1210 | 
            +
               "file_extension": ".py",
         | 
| 1211 | 
            +
               "mimetype": "text/x-python",
         | 
| 1212 | 
            +
               "name": "python",
         | 
| 1213 | 
            +
               "nbconvert_exporter": "python",
         | 
| 1214 | 
            +
               "pygments_lexer": "ipython3",
         | 
| 1215 | 
            +
               "version": "3.9.5 (default, Nov 23 2021, 15:27:38) \n[GCC 9.3.0]"
         | 
| 1216 | 
            +
              },
         | 
| 1217 | 
            +
              "vscode": {
         | 
| 1218 | 
            +
               "interpreter": {
         | 
| 1219 | 
            +
                "hash": "f9f85f796d01129d0dd105a088854619f454435301f6ffec2fea96ecbd9be4ac"
         | 
| 1220 | 
            +
               }
         | 
| 1221 | 
            +
              }
         | 
| 1222 | 
            +
             },
         | 
| 1223 | 
            +
             "nbformat": 4,
         | 
| 1224 | 
            +
             "nbformat_minor": 5
         | 
| 1225 | 
            +
            }
         | 
    	
        fine-tune-whisper-non-streaming.ipynb.1
    ADDED
    
    | @@ -0,0 +1,1207 @@ | |
| 1 | 
            +
            {
         | 
| 2 | 
            +
             "cells": [
         | 
| 3 | 
            +
              {
         | 
| 4 | 
            +
               "cell_type": "markdown",
         | 
| 5 | 
            +
               "id": "75b58048-7d14-4fc6-8085-1fc08c81b4a6",
         | 
| 6 | 
            +
               "metadata": {
         | 
| 7 | 
            +
                "id": "75b58048-7d14-4fc6-8085-1fc08c81b4a6"
         | 
| 8 | 
            +
               },
         | 
| 9 | 
            +
               "source": [
         | 
| 10 | 
            +
                "# Fine-Tune Whisper For Multilingual ASR with 🤗 Transformers"
         | 
| 11 | 
            +
               ]
         | 
| 12 | 
            +
              },
         | 
| 13 | 
            +
              {
         | 
| 14 | 
            +
               "cell_type": "markdown",
         | 
| 15 | 
            +
               "id": "fbfa8ad5-4cdc-4512-9058-836cbbf65e1a",
         | 
| 16 | 
            +
               "metadata": {
         | 
| 17 | 
            +
                "id": "fbfa8ad5-4cdc-4512-9058-836cbbf65e1a"
         | 
| 18 | 
            +
               },
         | 
| 19 | 
            +
               "source": [
         | 
| 20 | 
            +
                "In this Colab, we present a step-by-step guide on how to fine-tune Whisper \n",
         | 
| 21 | 
            +
                "for any multilingual ASR dataset using Hugging Face 🤗 Transformers. This is a \n",
         | 
| 22 | 
            +
                "more \"hands-on\" version of the accompanying [blog post](https://huggingface.co/blog/fine-tune-whisper). \n",
         | 
| 23 | 
            +
                "For a more in-depth explanation of Whisper, the Common Voice dataset and the theory behind fine-tuning, the reader is advised to refer to the blog post."
         | 
| 24 | 
            +
               ]
         | 
| 25 | 
            +
              },
         | 
| 26 | 
            +
              {
         | 
| 27 | 
            +
               "cell_type": "markdown",
         | 
| 28 | 
            +
               "id": "afe0d503-ae4e-4aa7-9af4-dbcba52db41e",
         | 
| 29 | 
            +
               "metadata": {
         | 
| 30 | 
            +
                "id": "afe0d503-ae4e-4aa7-9af4-dbcba52db41e"
         | 
| 31 | 
            +
               },
         | 
| 32 | 
            +
               "source": [
         | 
| 33 | 
            +
                "## Introduction"
         | 
| 34 | 
            +
               ]
         | 
| 35 | 
            +
              },
         | 
| 36 | 
            +
              {
         | 
| 37 | 
            +
               "cell_type": "markdown",
         | 
| 38 | 
            +
               "id": "9ae91ed4-9c3e-4ade-938e-f4c2dcfbfdc0",
         | 
| 39 | 
            +
               "metadata": {
         | 
| 40 | 
            +
                "id": "9ae91ed4-9c3e-4ade-938e-f4c2dcfbfdc0"
         | 
| 41 | 
            +
               },
         | 
| 42 | 
            +
               "source": [
         | 
| 43 | 
            +
                "Whisper is a pre-trained model for automatic speech recognition (ASR) \n",
         | 
| 44 | 
            +
                "published in [September 2022](https://openai.com/blog/whisper/) by the authors \n",
         | 
| 45 | 
            +
                "Alec Radford et al. from OpenAI. Unlike many of its predecessors, such as \n",
         | 
| 46 | 
            +
                "[Wav2Vec 2.0](https://arxiv.org/abs/2006.11477), which are pre-trained \n",
         | 
| 47 | 
            +
                "on un-labelled audio data, Whisper is pre-trained on a vast quantity of \n",
         | 
| 48 | 
            +
                "**labelled** audio-transcription data, 680,000 hours to be precise. \n",
         | 
| 49 | 
            +
                "This is an order of magnitude more data than the un-labelled audio data used \n",
         | 
| 50 | 
            +
                "to train Wav2Vec 2.0 (60,000 hours). What is more, 117,000 hours of this \n",
         | 
| 51 | 
            +
                "pre-training data is multilingual ASR data. This results in checkpoints \n",
         | 
| 52 | 
            +
                "that can be applied to over 96 languages, many of which are considered \n",
         | 
| 53 | 
            +
                "_low-resource_.\n",
         | 
| 54 | 
            +
                "\n",
         | 
| 55 | 
            +
                "When scaled to 680,000 hours of labelled pre-training data, Whisper models \n",
         | 
| 56 | 
            +
                "demonstrate a strong ability to generalise to many datasets and domains.\n",
         | 
| 57 | 
            +
                "The pre-trained checkpoints achieve competitive results to state-of-the-art \n",
         | 
| 58 | 
            +
                "ASR systems, with near 3% word error rate (WER) on the test-clean subset of \n",
         | 
| 59 | 
            +
                "LibriSpeech ASR and a new state-of-the-art on TED-LIUM with 4.7% WER (_c.f._ \n",
         | 
| 60 | 
            +
                "Table 8 of the [Whisper paper](https://cdn.openai.com/papers/whisper.pdf)).\n",
         | 
| 61 | 
            +
                "The extensive multilingual ASR knowledge acquired by Whisper during pre-training \n",
         | 
| 62 | 
            +
                "can be leveraged for other low-resource languages; through fine-tuning, the \n",
         | 
| 63 | 
            +
                "pre-trained checkpoints can be adapted for specific datasets and languages \n",
         | 
| 64 | 
            +
                "to further improve upon these results. We'll show just how Whisper can be fine-tuned \n",
         | 
| 65 | 
            +
                "for low-resource languages in this Colab."
         | 
| 66 | 
            +
               ]
         | 
| 67 | 
            +
              },
         | 
| 68 | 
            +
              {
         | 
| 69 | 
            +
               "cell_type": "markdown",
         | 
| 70 | 
            +
               "id": "e59b91d6-be24-4b5e-bb38-4977ea143a72",
         | 
| 71 | 
            +
               "metadata": {
         | 
| 72 | 
            +
                "id": "e59b91d6-be24-4b5e-bb38-4977ea143a72"
         | 
| 73 | 
            +
               },
         | 
| 74 | 
            +
               "source": [
         | 
| 75 | 
            +
                "<figure>\n",
         | 
| 76 | 
            +
                "<img src=\"https://raw.githubusercontent.com/sanchit-gandhi/notebooks/main/whisper_architecture.svg\" alt=\"Trulli\" style=\"width:100%\">\n",
         | 
| 77 | 
            +
                "<figcaption align = \"center\"><b>Figure 1:</b> Whisper model. The architecture \n",
         | 
| 78 | 
            +
                "follows the standard Transformer-based encoder-decoder model. A \n",
         | 
| 79 | 
            +
                "log-Mel spectrogram is input to the encoder. The last encoder \n",
         | 
| 80 | 
            +
                "hidden states are input to the decoder via cross-attention mechanisms. The \n",
         | 
| 81 | 
            +
                "decoder autoregressively predicts text tokens, jointly conditional on the \n",
         | 
| 82 | 
            +
                "encoder hidden states and previously predicted tokens. Figure source: \n",
         | 
| 83 | 
            +
                "<a href=\"https://openai.com/blog/whisper/\">OpenAI Whisper Blog</a>.</figcaption>\n",
         | 
| 84 | 
            +
                "</figure>"
         | 
| 85 | 
            +
               ]
         | 
| 86 | 
            +
              },
         | 
| 87 | 
            +
              {
         | 
| 88 | 
            +
               "cell_type": "markdown",
         | 
| 89 | 
            +
               "id": "21b6316e-8a55-4549-a154-66d3da2ab74a",
         | 
| 90 | 
            +
               "metadata": {
         | 
| 91 | 
            +
                "id": "21b6316e-8a55-4549-a154-66d3da2ab74a"
         | 
| 92 | 
            +
               },
         | 
| 93 | 
            +
               "source": [
         | 
| 94 | 
            +
                "The Whisper checkpoints come in five configurations of varying model sizes.\n",
         | 
| 95 | 
            +
                "The smallest four are trained on either English-only or multilingual data.\n",
         | 
| 96 | 
            +
                "The largest checkpoint is multilingual only. All nine of the pre-trained checkpoints \n",
         | 
| 97 | 
            +
                "are available on the [Hugging Face Hub](https://huggingface.co/models?search=openai/whisper). The \n",
         | 
| 98 | 
            +
                "checkpoints are summarised in the following table with links to the models on the Hub:\n",
         | 
| 99 | 
            +
                "\n",
         | 
| 100 | 
            +
                "| Size   | Layers | Width | Heads | Parameters | English-only                                         | Multilingual                                      |\n",
         | 
| 101 | 
            +
                "|--------|--------|-------|-------|------------|------------------------------------------------------|---------------------------------------------------|\n",
         | 
| 102 | 
            +
                "| tiny   | 4      | 384   | 6     | 39 M       | [✓](https://huggingface.co/openai/whisper-tiny.en)   | [✓](https://huggingface.co/openai/whisper-tiny.)  |\n",
         | 
| 103 | 
            +
                "| base   | 6      | 512   | 8     | 74 M       | [✓](https://huggingface.co/openai/whisper-base.en)   | [✓](https://huggingface.co/openai/whisper-base)   |\n",
         | 
| 104 | 
            +
                "| small  | 12     | 768   | 12    | 244 M      | [✓](https://huggingface.co/openai/whisper-small.en)  | [✓](https://huggingface.co/openai/whisper-small)  |\n",
         | 
| 105 | 
            +
                "| medium | 24     | 1024  | 16    | 769 M      | [✓](https://huggingface.co/openai/whisper-medium.en) | [✓](https://huggingface.co/openai/whisper-medium) |\n",
         | 
| 106 | 
            +
                "| large  | 32     | 1280  | 20    | 1550 M     | x                                                    | [✓](https://huggingface.co/openai/whisper-large)  |\n",
         | 
| 107 | 
            +
                "\n",
         | 
| 108 | 
            +
                "For demonstration purposes, we'll fine-tune the multilingual version of the \n",
         | 
| 109 | 
            +
                "[`\"small\"`](https://huggingface.co/openai/whisper-small) checkpoint with 244M params (~= 1GB). \n",
         | 
| 110 | 
            +
                "As for our data, we'll train and evaluate our system on a low-resource language \n",
         | 
| 111 | 
            +
                "taken from the [Common Voice](https://huggingface.co/datasets/mozilla-foundation/common_voice_11_0)\n",
         | 
| 112 | 
            +
                "dataset. We'll show that with as little as 8 hours of fine-tuning data, we can achieve \n",
         | 
| 113 | 
            +
                "strong performance in this language."
         | 
| 114 | 
            +
               ]
         | 
| 115 | 
            +
              },
         | 
| 116 | 
            +
              {
         | 
| 117 | 
            +
               "cell_type": "markdown",
         | 
| 118 | 
            +
               "id": "3a680dfc-cbba-4f6c-8a1f-e1a5ff3f123a",
         | 
| 119 | 
            +
               "metadata": {
         | 
| 120 | 
            +
                "id": "3a680dfc-cbba-4f6c-8a1f-e1a5ff3f123a"
         | 
| 121 | 
            +
               },
         | 
| 122 | 
            +
               "source": [
         | 
| 123 | 
            +
                "------------------------------------------------------------------------\n",
         | 
| 124 | 
            +
                "\n",
         | 
| 125 | 
            +
                "\\\\({}^1\\\\) The name Whisper follows from the acronym “WSPSR”, which stands for “Web-scale Supervised Pre-training for Speech Recognition”."
         | 
| 126 | 
            +
               ]
         | 
| 127 | 
            +
              },
         | 
| 128 | 
            +
              {
         | 
| 129 | 
            +
               "cell_type": "markdown",
         | 
| 130 | 
            +
               "id": "b219c9dd-39b6-4a95-b2a1-3f547a1e7bc0",
         | 
| 131 | 
            +
               "metadata": {
         | 
| 132 | 
            +
                "id": "b219c9dd-39b6-4a95-b2a1-3f547a1e7bc0"
         | 
| 133 | 
            +
               },
         | 
| 134 | 
            +
               "source": [
         | 
| 135 | 
            +
                "## Load Dataset"
         | 
| 136 | 
            +
               ]
         | 
| 137 | 
            +
              },
         | 
| 138 | 
            +
              {
         | 
| 139 | 
            +
               "cell_type": "markdown",
         | 
| 140 | 
            +
               "id": "674429c5-0ab4-4adf-975b-621bb69eca38",
         | 
| 141 | 
            +
               "metadata": {
         | 
| 142 | 
            +
                "id": "674429c5-0ab4-4adf-975b-621bb69eca38"
         | 
| 143 | 
            +
               },
         | 
| 144 | 
            +
               "source": [
         | 
| 145 | 
            +
                "Using 🤗 Datasets, downloading and preparing data is extremely simple. \n",
         | 
| 146 | 
            +
                "We can download and prepare the Common Voice splits in just one line of code. \n",
         | 
| 147 | 
            +
                "\n",
         | 
| 148 | 
            +
                "First, ensure you have accepted the terms of use on the Hugging Face Hub: [mozilla-foundation/common_voice_11_0](https://huggingface.co/datasets/mozilla-foundation/common_voice_11_0). Once you have accepted the terms, you will have full access to the dataset and be able to download the data locally.\n",
         | 
| 149 | 
            +
                "\n",
         | 
| 150 | 
            +
                "Since Hindi is very low-resource, we'll combine the `train` and `validation` \n",
         | 
| 151 | 
            +
                "splits to give approximately 8 hours of training data. We'll use the 4 hours \n",
         | 
| 152 | 
            +
                "of `test` data as our held-out test set:"
         | 
| 153 | 
            +
               ]
         | 
| 154 | 
            +
              },
         | 
| 155 | 
            +
              {
         | 
| 156 | 
            +
               "cell_type": "code",
         | 
| 157 | 
            +
               "execution_count": null,
         | 
| 158 | 
            +
               "id": "a2787582-554f-44ce-9f38-4180a5ed6b44",
         | 
| 159 | 
            +
               "metadata": {
         | 
| 160 | 
            +
                "id": "a2787582-554f-44ce-9f38-4180a5ed6b44"
         | 
| 161 | 
            +
               },
         | 
| 162 | 
            +
               "outputs": [],
         | 
| 163 | 
            +
               "source": [
         | 
| 164 | 
            +
                "from datasets import load_dataset, DatasetDict\n",
         | 
| 165 | 
            +
                "\n",
         | 
| 166 | 
            +
                "common_voice = DatasetDict()\n",
         | 
| 167 | 
            +
                "\n",
         | 
| 168 | 
            +
                "common_voice[\"train\"] = load_dataset(\"mozilla-foundation/common_voice_11_0\", \"hi\", split=\"train+validation\", use_auth_token=True)\n",
         | 
| 169 | 
            +
                "common_voice[\"test\"] = load_dataset(\"mozilla-foundation/common_voice_11_0\", \"hi\", split=\"test\", use_auth_token=True)\n",
         | 
| 170 | 
            +
                "\n",
         | 
| 171 | 
            +
                "print(common_voice)"
         | 
| 172 | 
            +
               ]
         | 
| 173 | 
            +
              },
         | 
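To get a feel for the data before any preprocessing, we can peek at a single training example; each row carries the raw audio as a dict (waveform array plus sampling rate) alongside the reference transcription. A quick, illustrative sketch:

```python
# Inspect the first training example: Common Voice rows expose the raw
# waveform under "audio" and the reference transcription under "sentence".
sample = common_voice["train"][0]
print(sample["audio"]["sampling_rate"])  # 48000: Common Voice ships 48 kHz audio
print(sample["sentence"])
```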
| 174 | 
            +
              {
         | 
| 175 | 
            +
               "cell_type": "markdown",
         | 
| 176 | 
            +
               "id": "d5c7c3d6-7197-41e7-a088-49b753c1681f",
         | 
| 177 | 
            +
               "metadata": {
         | 
| 178 | 
            +
                "id": "d5c7c3d6-7197-41e7-a088-49b753c1681f"
         | 
| 179 | 
            +
               },
         | 
| 180 | 
            +
               "source": [
         | 
| 181 | 
            +
                "Most ASR datasets only provide input audio samples (`audio`) and the \n",
         | 
| 182 | 
            +
                "corresponding transcribed text (`sentence`). Common Voice contains additional \n",
         | 
| 183 | 
            +
                "metadata information, such as `accent` and `locale`, which we can disregard for ASR.\n",
         | 
| 184 | 
            +
                "Keeping the notebook as general as possible, we only consider the input audio and\n",
         | 
| 185 | 
            +
                "transcribed text for fine-tuning, discarding the additional metadata information:"
         | 
| 186 | 
            +
               ]
         | 
| 187 | 
            +
              },
         | 
| 188 | 
            +
              {
         | 
| 189 | 
            +
               "cell_type": "code",
         | 
| 190 | 
            +
               "execution_count": null,
         | 
| 191 | 
            +
               "id": "20ba635d-518c-47ac-97ee-3cad25f1e0ce",
         | 
| 192 | 
            +
               "metadata": {
         | 
| 193 | 
            +
                "id": "20ba635d-518c-47ac-97ee-3cad25f1e0ce"
         | 
| 194 | 
            +
               },
         | 
| 195 | 
            +
               "outputs": [],
         | 
| 196 | 
            +
               "source": [
         | 
| 197 | 
            +
                "common_voice = common_voice.remove_columns([\"accent\", \"age\", \"client_id\", \"down_votes\", \"gender\", \"locale\", \"path\", \"segment\", \"up_votes\"])\n",
         | 
| 198 | 
            +
                "\n",
         | 
| 199 | 
            +
                "print(common_voice)"
         | 
| 200 | 
            +
               ]
         | 
| 201 | 
            +
              },
         | 
| 202 | 
            +
              {
         | 
| 203 | 
            +
               "cell_type": "markdown",
         | 
| 204 | 
            +
               "id": "2d63b2d2-f68a-4d74-b7f1-5127f6d16605",
         | 
| 205 | 
            +
               "metadata": {
         | 
| 206 | 
            +
                "id": "2d63b2d2-f68a-4d74-b7f1-5127f6d16605"
         | 
| 207 | 
            +
               },
         | 
| 208 | 
            +
               "source": [
         | 
| 209 | 
            +
                "## Prepare Feature Extractor, Tokenizer and Data"
         | 
| 210 | 
            +
               ]
         | 
| 211 | 
            +
              },
         | 
| 212 | 
            +
              {
         | 
| 213 | 
            +
               "cell_type": "markdown",
         | 
| 214 | 
            +
               "id": "601c3099-1026-439e-93e2-5635b3ba5a73",
         | 
| 215 | 
            +
               "metadata": {
         | 
| 216 | 
            +
                "id": "601c3099-1026-439e-93e2-5635b3ba5a73"
         | 
| 217 | 
            +
               },
         | 
| 218 | 
            +
               "source": [
         | 
| 219 | 
            +
                "The ASR pipeline can be de-composed into three stages: \n",
         | 
| 220 | 
            +
                "1) A feature extractor which pre-processes the raw audio-inputs\n",
         | 
| 221 | 
            +
                "2) The model which performs the sequence-to-sequence mapping \n",
         | 
| 222 | 
            +
                "3) A tokenizer which post-processes the model outputs to text format\n",
         | 
| 223 | 
            +
                "\n",
         | 
| 224 | 
            +
                "In 🤗 Transformers, the Whisper model has an associated feature extractor and tokenizer, \n",
         | 
| 225 | 
            +
                "called [WhisperFeatureExtractor](https://huggingface.co/docs/transformers/main/model_doc/whisper#transformers.WhisperFeatureExtractor)\n",
         | 
| 226 | 
            +
                "and [WhisperTokenizer](https://huggingface.co/docs/transformers/main/model_doc/whisper#transformers.WhisperTokenizer) \n",
         | 
| 227 | 
            +
                "respectively.\n",
         | 
| 228 | 
            +
                "\n",
         | 
| 229 | 
            +
                "We'll go through details for setting-up the feature extractor and tokenizer one-by-one!"
         | 
| 230 | 
            +
               ]
         | 
| 231 | 
            +
              },
         | 
| 232 | 
            +
              {
         | 
| 233 | 
            +
               "cell_type": "markdown",
         | 
| 234 | 
            +
               "id": "560332eb-3558-41a1-b500-e83a9f695f84",
         | 
| 235 | 
            +
               "metadata": {
         | 
| 236 | 
            +
                "id": "560332eb-3558-41a1-b500-e83a9f695f84"
         | 
| 237 | 
            +
               },
         | 
| 238 | 
            +
               "source": [
         | 
| 239 | 
            +
                "### Load WhisperFeatureExtractor"
         | 
| 240 | 
            +
               ]
         | 
| 241 | 
            +
              },
         | 
| 242 | 
            +
              {
         | 
| 243 | 
            +
               "cell_type": "markdown",
         | 
| 244 | 
            +
               "id": "32ec8068-0bd7-412d-b662-0edb9d1e7365",
         | 
| 245 | 
            +
               "metadata": {
         | 
| 246 | 
            +
                "id": "32ec8068-0bd7-412d-b662-0edb9d1e7365"
         | 
| 247 | 
            +
               },
         | 
| 248 | 
            +
               "source": [
         | 
| 249 | 
            +
                "The Whisper feature extractor performs two operations:\n",
         | 
| 250 | 
            +
                "1. Pads / truncates the audio inputs to 30s: any audio inputs shorter than 30s are padded to 30s with silence (zeros), and those longer that 30s are truncated to 30s\n",
         | 
| 251 | 
            +
                "2. Converts the audio inputs to _log-Mel spectrogram_ input features, a visual representation of the audio and the form of the input expected by the Whisper model"
         | 
| 252 | 
            +
               ]
         | 
| 253 | 
            +
              },
         | 
| 254 | 
            +
              {
         | 
| 255 | 
            +
               "cell_type": "markdown",
         | 
| 256 | 
            +
               "id": "589d9ec1-d12b-4b64-93f7-04c63997da19",
         | 
| 257 | 
            +
               "metadata": {
         | 
| 258 | 
            +
                "id": "589d9ec1-d12b-4b64-93f7-04c63997da19"
         | 
| 259 | 
            +
               },
         | 
| 260 | 
            +
               "source": [
         | 
| 261 | 
            +
                "<figure>\n",
         | 
| 262 | 
            +
                "<img src=\"https://raw.githubusercontent.com/sanchit-gandhi/notebooks/main/spectrogram.jpg\" alt=\"Trulli\" style=\"width:100%\">\n",
         | 
| 263 | 
            +
                "<figcaption align = \"center\"><b>Figure 2:</b> Conversion of sampled audio array to log-Mel spectrogram.\n",
         | 
| 264 | 
            +
                "Left: sampled 1-dimensional audio signal. Right: corresponding log-Mel spectrogram. Figure source:\n",
         | 
| 265 | 
            +
                "<a href=\"https://ai.googleblog.com/2019/04/specaugment-new-data-augmentation.html\">Google SpecAugment Blog</a>.\n",
         | 
| 266 | 
            +
                "</figcaption>"
         | 
| 267 | 
            +
               ]
         | 
| 268 | 
            +
              },
         | 
| 269 | 
            +
              {
         | 
| 270 | 
            +
               "cell_type": "markdown",
         | 
| 271 | 
            +
               "id": "b2ef54d5-b946-4c1d-9fdc-adc5d01b46aa",
         | 
| 272 | 
            +
               "metadata": {
         | 
| 273 | 
            +
                "id": "b2ef54d5-b946-4c1d-9fdc-adc5d01b46aa"
         | 
| 274 | 
            +
               },
         | 
| 275 | 
            +
               "source": [
         | 
| 276 | 
            +
                "We'll load the feature extractor from the pre-trained checkpoint with the default values:"
         | 
| 277 | 
            +
               ]
         | 
| 278 | 
            +
              },
         | 
| 279 | 
            +
              {
         | 
| 280 | 
            +
               "cell_type": "code",
         | 
| 281 | 
            +
               "execution_count": null,
         | 
| 282 | 
            +
               "id": "bc77d7bb-f9e2-47f5-b663-30f7a4321ce5",
         | 
| 283 | 
            +
               "metadata": {
         | 
| 284 | 
            +
                "id": "bc77d7bb-f9e2-47f5-b663-30f7a4321ce5"
         | 
| 285 | 
            +
               },
         | 
| 286 | 
            +
               "outputs": [],
         | 
| 287 | 
            +
               "source": [
         | 
| 288 | 
            +
                "from transformers import WhisperFeatureExtractor\n",
         | 
| 289 | 
            +
                "\n",
         | 
| 290 | 
            +
                "feature_extractor = WhisperFeatureExtractor.from_pretrained(\"openai/whisper-small\")"
         | 
| 291 | 
            +
               ]
         | 
| 292 | 
            +
              },
         | 
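To see the pad-then-log-Mel behaviour described above concretely, we can run the feature extractor on a synthetic waveform: a one-second input at 16 kHz is padded out to 30 s and mapped to a fixed-size spectrogram. A minimal sketch (the dummy audio is purely illustrative):

```python
import numpy as np

# One second of silence at 16 kHz stands in for a real utterance.
dummy_audio = np.zeros(16000, dtype=np.float32)

features = feature_extractor(dummy_audio, sampling_rate=16000, return_tensors="np")
# The input is padded to 30 s and converted to a log-Mel spectrogram of
# fixed shape (batch, mel bins, frames) = (1, 80, 3000) for whisper-small.
print(features.input_features.shape)
```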
| 293 | 
            +
              {
         | 
| 294 | 
            +
               "cell_type": "markdown",
         | 
| 295 | 
            +
               "id": "93748af7-b917-4ecf-a0c8-7d89077ff9cb",
         | 
| 296 | 
            +
               "metadata": {
         | 
| 297 | 
            +
                "id": "93748af7-b917-4ecf-a0c8-7d89077ff9cb"
         | 
| 298 | 
            +
               },
         | 
| 299 | 
            +
               "source": [
         | 
| 300 | 
            +
                "### Load WhisperTokenizer"
         | 
| 301 | 
            +
               ]
         | 
| 302 | 
            +
              },
         | 
| 303 | 
            +
              {
         | 
| 304 | 
            +
               "cell_type": "markdown",
         | 
| 305 | 
            +
               "id": "2bc82609-a9fb-447a-a2af-99597c864029",
         | 
| 306 | 
            +
               "metadata": {
         | 
| 307 | 
            +
                "id": "2bc82609-a9fb-447a-a2af-99597c864029"
         | 
| 308 | 
            +
               },
         | 
| 309 | 
            +
               "source": [
         | 
| 310 | 
            +
                "The Whisper model outputs a sequence of _token ids_. The tokenizer maps each of these token ids to their corresponding text string. For Hindi, we can load the pre-trained tokenizer and use it for fine-tuning without any further modifications. We simply have to \n",
         | 
| 311 | 
            +
                "specify the target language and the task. These arguments inform the \n",
         | 
| 312 | 
            +
                "tokenizer to prefix the language and task tokens to the start of encoded \n",
         | 
| 313 | 
            +
                "label sequences:"
         | 
| 314 | 
            +
               ]
         | 
| 315 | 
            +
              },
         | 
| 316 | 
            +
              {
         | 
| 317 | 
            +
               "cell_type": "code",
         | 
| 318 | 
            +
               "execution_count": null,
         | 
| 319 | 
            +
               "id": "c7b07f9b-ae0e-4f89-98f0-0c50d432eab6",
         | 
| 320 | 
            +
               "metadata": {
         | 
| 321 | 
            +
                "id": "c7b07f9b-ae0e-4f89-98f0-0c50d432eab6",
         | 
| 322 | 
            +
                "outputId": "5c004b44-86e7-4e00-88be-39e0af5eed69"
         | 
| 323 | 
            +
               },
         | 
| 324 | 
            +
               "outputs": [
         | 
| 325 | 
            +
                {
         | 
| 326 | 
            +
                 "data": {
         | 
| 327 | 
            +
                  "application/vnd.jupyter.widget-view+json": {
         | 
| 328 | 
            +
                   "model_id": "90d056e20b3e4f14ae0199a1a4ab1bb0",
         | 
| 329 | 
            +
                   "version_major": 2,
         | 
| 330 | 
            +
                   "version_minor": 0
         | 
| 331 | 
            +
                  },
         | 
| 332 | 
            +
                  "text/plain": [
         | 
| 333 | 
            +
                   "Downloading:   0%|          | 0.00/829 [00:00<?, ?B/s]"
         | 
| 334 | 
            +
                  ]
         | 
| 335 | 
            +
                 },
         | 
| 336 | 
            +
                 "metadata": {},
         | 
| 337 | 
            +
                 "output_type": "display_data"
         | 
| 338 | 
            +
                },
         | 
| 339 | 
            +
                {
         | 
| 340 | 
            +
                 "data": {
         | 
| 341 | 
            +
                  "application/vnd.jupyter.widget-view+json": {
         | 
| 342 | 
            +
                   "model_id": "d82a88daec0e4f14add691b7b903064c",
         | 
| 343 | 
            +
                   "version_major": 2,
         | 
| 344 | 
            +
                   "version_minor": 0
         | 
| 345 | 
            +
                  },
         | 
| 346 | 
            +
                  "text/plain": [
         | 
| 347 | 
            +
                   "Downloading:   0%|          | 0.00/1.04M [00:00<?, ?B/s]"
         | 
| 348 | 
            +
                  ]
         | 
| 349 | 
            +
                 },
         | 
| 350 | 
            +
                 "metadata": {},
         | 
| 351 | 
            +
                 "output_type": "display_data"
         | 
| 352 | 
            +
                },
         | 
| 353 | 
            +
                {
         | 
| 354 | 
            +
                 "data": {
         | 
| 355 | 
            +
                  "application/vnd.jupyter.widget-view+json": {
         | 
| 356 | 
            +
                   "model_id": "350acdb0f40e454099fa901e66de55f0",
         | 
| 357 | 
            +
                   "version_major": 2,
         | 
| 358 | 
            +
                   "version_minor": 0
         | 
| 359 | 
            +
                  },
         | 
| 360 | 
            +
                  "text/plain": [
         | 
| 361 | 
            +
                   "Downloading:   0%|          | 0.00/494k [00:00<?, ?B/s]"
         | 
| 362 | 
            +
                  ]
         | 
| 363 | 
            +
                 },
         | 
| 364 | 
            +
                 "metadata": {},
         | 
| 365 | 
            +
                 "output_type": "display_data"
         | 
| 366 | 
            +
                },
         | 
| 367 | 
            +
                {
         | 
| 368 | 
            +
                 "data": {
         | 
| 369 | 
            +
                  "application/vnd.jupyter.widget-view+json": {
         | 
| 370 | 
            +
                   "model_id": "2e6a82a462cc411d90fa1bea4ee60790",
         | 
| 371 | 
            +
                   "version_major": 2,
         | 
| 372 | 
            +
                   "version_minor": 0
         | 
| 373 | 
            +
                  },
         | 
| 374 | 
            +
                  "text/plain": [
         | 
| 375 | 
            +
                   "Downloading:   0%|          | 0.00/52.7k [00:00<?, ?B/s]"
         | 
| 376 | 
            +
                  ]
         | 
| 377 | 
            +
                 },
         | 
| 378 | 
            +
                 "metadata": {},
         | 
| 379 | 
            +
                 "output_type": "display_data"
         | 
| 380 | 
            +
                },
         | 
| 381 | 
            +
                {
         | 
| 382 | 
            +
                 "data": {
         | 
| 383 | 
            +
                  "application/vnd.jupyter.widget-view+json": {
         | 
| 384 | 
            +
                   "model_id": "c74bfee0198b4817832ea86e8e88d96c",
         | 
| 385 | 
            +
                   "version_major": 2,
         | 
| 386 | 
            +
                   "version_minor": 0
         | 
| 387 | 
            +
                  },
         | 
| 388 | 
            +
                  "text/plain": [
         | 
| 389 | 
            +
                   "Downloading:   0%|          | 0.00/2.11k [00:00<?, ?B/s]"
         | 
| 390 | 
            +
                  ]
         | 
| 391 | 
            +
                 },
         | 
| 392 | 
            +
                 "metadata": {},
         | 
| 393 | 
            +
                 "output_type": "display_data"
         | 
| 394 | 
            +
                },
         | 
| 395 | 
            +
                {
         | 
| 396 | 
            +
                 "data": {
         | 
| 397 | 
            +
                  "application/vnd.jupyter.widget-view+json": {
         | 
| 398 | 
            +
                   "model_id": "04fb2d81eff646068e10475a08ae42f4",
         | 
| 399 | 
            +
                   "version_major": 2,
         | 
| 400 | 
            +
                   "version_minor": 0
         | 
| 401 | 
            +
                  },
         | 
| 402 | 
            +
                  "text/plain": [
         | 
| 403 | 
            +
                   "Downloading:   0%|          | 0.00/2.06k [00:00<?, ?B/s]"
         | 
| 404 | 
            +
                  ]
         | 
| 405 | 
            +
                 },
         | 
| 406 | 
            +
                 "metadata": {},
         | 
| 407 | 
            +
                 "output_type": "display_data"
         | 
| 408 | 
            +
                }
         | 
| 409 | 
            +
               ],
         | 
| 410 | 
            +
               "source": [
         | 
| 411 | 
            +
                "from transformers import WhisperTokenizer\n",
         | 
| 412 | 
            +
                "\n",
         | 
| 413 | 
            +
                "tokenizer = WhisperTokenizer.from_pretrained(\"openai/whisper-small\", language=\"Hindi\", task=\"transcribe\")"
         | 
| 414 | 
            +
               ]
         | 
| 415 | 
            +
              },
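            +
               {
            +
                "cell_type": "code",
            +
                "execution_count": null,
            +
                "metadata": {},
            +
                "outputs": [],
            +
                "source": [
            +
                 "# quick round-trip sanity check (the input string is illustrative, not taken\n",
            +
                 "# from the dataset): encode a short Hindi string, then decode it with and\n",
            +
                 "# without the special language/task tokens added by the tokenizer\n",
            +
                 "ids = tokenizer(\"नमस्ते\").input_ids\n",
            +
                 "print(tokenizer.decode(ids, skip_special_tokens=False))\n",
            +
                 "print(tokenizer.decode(ids, skip_special_tokens=True))"
            +
                ]
            +
               },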
         | 
| 416 | 
            +
              {
         | 
| 417 | 
            +
               "cell_type": "markdown",
         | 
| 418 | 
            +
               "id": "d2ef23f3-f4a8-483a-a2dc-080a7496cb1b",
         | 
| 419 | 
            +
               "metadata": {
         | 
| 420 | 
            +
                "id": "d2ef23f3-f4a8-483a-a2dc-080a7496cb1b"
         | 
| 421 | 
            +
               },
         | 
| 422 | 
            +
               "source": [
         | 
| 423 | 
            +
                "### Combine To Create A WhisperProcessor"
         | 
| 424 | 
            +
               ]
         | 
| 425 | 
            +
              },
         | 
| 426 | 
            +
              {
         | 
| 427 | 
            +
               "cell_type": "markdown",
         | 
| 428 | 
            +
               "id": "5ff67654-5a29-4bb8-a69d-0228946c6f8d",
         | 
| 429 | 
            +
               "metadata": {
         | 
| 430 | 
            +
                "id": "5ff67654-5a29-4bb8-a69d-0228946c6f8d"
         | 
| 431 | 
            +
               },
         | 
| 432 | 
            +
               "source": [
         | 
| 433 | 
            +
                "To simplify using the feature extractor and tokenizer, we can _wrap_ \n",
         | 
| 434 | 
            +
                "both into a single `WhisperProcessor` class. This processor object \n",
         | 
| 435 | 
            +
                "inherits from the `WhisperFeatureExtractor` and `WhisperProcessor`, \n",
         | 
| 436 | 
            +
                "and can be used on the audio inputs and model predictions as required. \n",
         | 
| 437 | 
            +
                "In doing so, we only need to keep track of two objects during training: \n",
         | 
| 438 | 
            +
                "the `processor` and the `model`:"
         | 
| 439 | 
            +
               ]
         | 
| 440 | 
            +
              },
         | 
| 441 | 
            +
              {
         | 
| 442 | 
            +
               "cell_type": "code",
         | 
| 443 | 
            +
               "execution_count": null,
         | 
| 444 | 
            +
               "id": "77d9f0c5-8607-4642-a8ac-c3ab2e223ea6",
         | 
| 445 | 
            +
               "metadata": {
         | 
| 446 | 
            +
                "id": "77d9f0c5-8607-4642-a8ac-c3ab2e223ea6"
         | 
| 447 | 
            +
               },
         | 
| 448 | 
            +
               "outputs": [],
         | 
| 449 | 
            +
               "source": [
         | 
| 450 | 
            +
                "from transformers import WhisperProcessor\n",
         | 
| 451 | 
            +
                "\n",
         | 
| 452 | 
            +
                "processor = WhisperProcessor.from_pretrained(\"openai/whisper-small\", language=\"Hindi\", task=\"transcribe\")"
         | 
| 453 | 
            +
               ]
         | 
| 454 | 
            +
              },
         | 
| 455 | 
            +
              {
         | 
| 456 | 
            +
               "cell_type": "markdown",
         | 
| 457 | 
            +
               "id": "381acd09-0b0f-4d04-9eb3-f028ac0e5f2c",
         | 
| 458 | 
            +
               "metadata": {
         | 
| 459 | 
            +
                "id": "381acd09-0b0f-4d04-9eb3-f028ac0e5f2c"
         | 
| 460 | 
            +
               },
         | 
| 461 | 
            +
               "source": [
         | 
| 462 | 
            +
                "### Prepare Data"
         | 
| 463 | 
            +
               ]
         | 
| 464 | 
            +
              },
         | 
| 465 | 
            +
              {
         | 
| 466 | 
            +
               "cell_type": "markdown",
         | 
| 467 | 
            +
               "id": "9649bf01-2e8a-45e5-8fca-441c13637b8f",
         | 
| 468 | 
            +
               "metadata": {
         | 
| 469 | 
            +
                "id": "9649bf01-2e8a-45e5-8fca-441c13637b8f"
         | 
| 470 | 
            +
               },
         | 
| 471 | 
            +
               "source": [
         | 
| 472 | 
            +
                "Let's print the first example of the Common Voice dataset to see \n",
         | 
| 473 | 
            +
                "what form the data is in:"
         | 
| 474 | 
            +
               ]
         | 
| 475 | 
            +
              },
         | 
| 476 | 
            +
              {
         | 
| 477 | 
            +
               "cell_type": "code",
         | 
| 478 | 
            +
               "execution_count": null,
         | 
| 479 | 
            +
               "id": "6e6b0ec5-0c94-4e2c-ae24-c791be1b2255",
         | 
| 480 | 
            +
               "metadata": {
         | 
| 481 | 
            +
                "id": "6e6b0ec5-0c94-4e2c-ae24-c791be1b2255"
         | 
| 482 | 
            +
               },
         | 
| 483 | 
            +
               "outputs": [],
         | 
| 484 | 
            +
               "source": [
         | 
| 485 | 
            +
                "print(common_voice[\"train\"][0])"
         | 
| 486 | 
            +
               ]
         | 
| 487 | 
            +
              },
         | 
| 488 | 
            +
              {
         | 
| 489 | 
            +
               "cell_type": "markdown",
         | 
| 490 | 
            +
               "id": "5a679f05-063d-41b3-9b58-4fc9c6ccf4fd",
         | 
| 491 | 
            +
               "metadata": {
         | 
| 492 | 
            +
                "id": "5a679f05-063d-41b3-9b58-4fc9c6ccf4fd"
         | 
| 493 | 
            +
               },
         | 
| 494 | 
            +
               "source": [
         | 
| 495 | 
            +
                "Since \n",
         | 
| 496 | 
            +
                "our input audio is sampled at 48kHz, we need to _downsample_ it to \n",
         | 
| 497 | 
            +
                "16kHz prior to passing it to the Whisper feature extractor, 16kHz being the sampling rate expected by the Whisper model. \n",
         | 
| 498 | 
            +
                "\n",
         | 
| 499 | 
            +
                "We'll set the audio inputs to the correct sampling rate using dataset's \n",
         | 
| 500 | 
            +
                "[`cast_column`](https://huggingface.co/docs/datasets/package_reference/main_classes.html?highlight=cast_column#datasets.DatasetDict.cast_column)\n",
         | 
| 501 | 
            +
                "method. This operation does not change the audio in-place, \n",
         | 
| 502 | 
            +
                "but rather signals to `datasets` to resample audio samples _on the fly_ the \n",
         | 
| 503 | 
            +
                "first time that they are loaded:"
         | 
| 504 | 
            +
               ]
         | 
| 505 | 
            +
              },
         | 
| 506 | 
            +
              {
         | 
| 507 | 
            +
               "cell_type": "code",
         | 
| 508 | 
            +
               "execution_count": null,
         | 
| 509 | 
            +
               "id": "f12e2e57-156f-417b-8cfb-69221cc198e8",
         | 
| 510 | 
            +
               "metadata": {
         | 
| 511 | 
            +
                "id": "f12e2e57-156f-417b-8cfb-69221cc198e8"
         | 
| 512 | 
            +
               },
         | 
| 513 | 
            +
               "outputs": [],
         | 
| 514 | 
            +
               "source": [
         | 
| 515 | 
            +
                "from datasets import Audio\n",
         | 
| 516 | 
            +
                "\n",
         | 
| 517 | 
            +
                "common_voice = common_voice.cast_column(\"audio\", Audio(sampling_rate=16000))"
         | 
| 518 | 
            +
               ]
         | 
| 519 | 
            +
              },
         | 
| 520 | 
            +
              {
         | 
| 521 | 
            +
               "cell_type": "markdown",
         | 
| 522 | 
            +
               "id": "00382a3e-abec-4cdd-a54c-d1aaa3ea4707",
         | 
| 523 | 
            +
               "metadata": {
         | 
| 524 | 
            +
                "id": "00382a3e-abec-4cdd-a54c-d1aaa3ea4707"
         | 
| 525 | 
            +
               },
         | 
| 526 | 
            +
               "source": [
         | 
| 527 | 
            +
                "Re-loading the first audio sample in the Common Voice dataset will resample \n",
         | 
| 528 | 
            +
                "it to the desired sampling rate:"
         | 
| 529 | 
            +
               ]
         | 
| 530 | 
            +
              },
         | 
| 531 | 
            +
              {
         | 
| 532 | 
            +
               "cell_type": "code",
         | 
| 533 | 
            +
               "execution_count": null,
         | 
| 534 | 
            +
               "id": "87122d71-289a-466a-afcf-fa354b18946b",
         | 
| 535 | 
            +
               "metadata": {
         | 
| 536 | 
            +
                "id": "87122d71-289a-466a-afcf-fa354b18946b"
         | 
| 537 | 
            +
               },
         | 
| 538 | 
            +
               "outputs": [],
         | 
| 539 | 
            +
               "source": [
         | 
| 540 | 
            +
                "print(common_voice[\"train\"][0])"
         | 
| 541 | 
            +
               ]
         | 
| 542 | 
            +
              },
         | 
| 543 | 
            +
              {
         | 
| 544 | 
            +
               "cell_type": "markdown",
         | 
| 545 | 
            +
               "id": "3df7378a-a4c0-45d7-8d07-defbd1062ab6",
         | 
| 546 | 
            +
               "metadata": {},
         | 
| 547 | 
            +
               "source": [
         | 
| 548 | 
            +
                "We'll define our pre-processing strategy. We advise that you **do not** lower-case the transcriptions or remove punctuation unless mixing different datasets. This will enable you to fine-tune Whisper models that can predict punctuation and casing. Later, you will see how we can evaluate the predictions without punctuation or casing, so that the models benefit from the WER improvement obtained by normalising the transcriptions while still predicting fully formatted transcriptions."
         | 
| 549 | 
            +
               ]
         | 
| 550 | 
            +
              },
         | 
| 551 | 
            +
              {
         | 
| 552 | 
            +
               "cell_type": "code",
         | 
| 553 | 
            +
               "execution_count": null,
         | 
| 554 | 
            +
               "id": "d041650e-1c48-4439-87b3-5b6f4a514107",
         | 
| 555 | 
            +
               "metadata": {},
         | 
| 556 | 
            +
               "outputs": [],
         | 
| 557 | 
            +
               "source": [
         | 
| 558 | 
            +
                "from transformers.models.whisper.english_normalizer import BasicTextNormalizer\n",
         | 
| 559 | 
            +
                "\n",
         | 
| 560 | 
            +
                "do_lower_case = False\n",
         | 
| 561 | 
            +
                "do_remove_punctuation = False\n",
         | 
| 562 | 
            +
                "\n",
         | 
| 563 | 
            +
                "normalizer = BasicTextNormalizer()"
         | 
| 564 | 
            +
               ]
         | 
| 565 | 
            +
              },
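            +
               {
            +
                "cell_type": "code",
            +
                "execution_count": null,
            +
                "metadata": {},
            +
                "outputs": [],
            +
                "source": [
            +
                 "# illustrative check on a toy English string (not from the dataset): the basic\n",
            +
                 "# normaliser lower-cases, strips punctuation and collapses whitespace\n",
            +
                 "print(normalizer(\"Hello, World!\").strip())  # -> hello world"
            +
                ]
            +
               },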
         | 
| 566 | 
            +
              {
         | 
| 567 | 
            +
               "cell_type": "markdown",
         | 
| 568 | 
            +
               "id": "89e12c2e-2f14-479b-987b-f0c75c881095",
         | 
| 569 | 
            +
               "metadata": {},
         | 
| 570 | 
            +
               "source": [
         | 
| 571 | 
            +
                "Now we can write a function to prepare our data ready for the model:\n",
         | 
| 572 | 
            +
                "1. We load and resample the audio data by calling `batch[\"audio\"]`. As explained above, 🤗 Datasets performs any necessary resampling operations on the fly.\n",
         | 
| 573 | 
            +
                "2. We use the feature extractor to compute the log-Mel spectrogram input features from our 1-dimensional audio array.\n",
         | 
| 574 | 
            +
                "3. We perform any optional pre-processing (lower-case or remove punctuation).\n",
         | 
| 575 | 
            +
                "4. We encode the transcriptions to label ids through the use of the tokenizer."
         | 
| 576 | 
            +
               ]
         | 
| 577 | 
            +
              },
         | 
| 578 | 
            +
              {
         | 
| 579 | 
            +
               "cell_type": "code",
         | 
| 580 | 
            +
               "execution_count": null,
         | 
| 581 | 
            +
               "id": "c085911c-a10a-41ef-8874-306e0503e9bb",
         | 
| 582 | 
            +
               "metadata": {},
         | 
| 583 | 
            +
               "outputs": [],
         | 
| 584 | 
            +
               "source": [
         | 
| 585 | 
            +
                "def prepare_dataset(batch):\n",
         | 
| 586 | 
            +
                "    # load and (possibly) resample audio data to 16kHz\n",
         | 
| 587 | 
            +
                "    audio = batch[\"audio\"]\n",
         | 
| 588 | 
            +
                "\n",
         | 
| 589 | 
            +
                "    # compute log-Mel input features from input audio array \n",
         | 
| 590 | 
            +
                "    batch[\"input_features\"] = processor.feature_extractor(audio[\"array\"], sampling_rate=audio[\"sampling_rate\"]).input_features[0]\n",
         | 
| 591 | 
            +
                "    # compute input length of audio sample in seconds\n",
         | 
| 592 | 
            +
                "    batch[\"input_length\"] = len(audio[\"array\"]) / audio[\"sampling_rate\"]\n",
         | 
| 593 | 
            +
                "    \n",
         | 
| 594 | 
            +
                "    # optional pre-processing steps\n",
         | 
| 595 | 
            +
                "    transcription = batch[\"sentence\"]\n",
         | 
| 596 | 
            +
                "    if do_lower_case:\n",
         | 
| 597 | 
            +
                "        transcription = transcription.lower()\n",
         | 
| 598 | 
            +
                "    if do_remove_punctuation:\n",
         | 
| 599 | 
            +
                "        transcription = normalizer(transcription).strip()\n",
         | 
| 600 | 
            +
                "    \n",
         | 
| 601 | 
            +
                "    # encode target text to label ids\n",
         | 
| 602 | 
            +
                "    batch[\"labels\"] = processor.tokenizer(transcription).input_ids\n",
         | 
| 603 | 
            +
                "    return batch"
         | 
| 604 | 
            +
               ]
         | 
| 605 | 
            +
              },
         | 
| 606 | 
            +
              {
         | 
| 607 | 
            +
               "cell_type": "markdown",
         | 
| 608 | 
            +
               "id": "8c960965-9fb6-466f-9dbd-c9d43e71d9d0",
         | 
| 609 | 
            +
               "metadata": {
         | 
| 610 | 
            +
                "id": "70b319fb-2439-4ef6-a70d-a47bf41c4a13"
         | 
| 611 | 
            +
               },
         | 
| 612 | 
            +
               "source": [
         | 
| 613 | 
            +
                "We can apply the data preparation function to all of our training examples using dataset's `.map` method. The argument `num_proc` specifies how many CPU cores to use. Setting `num_proc` > 1 will enable multiprocessing. If the `.map` method hangs with multiprocessing, set `num_proc=1` and process the dataset sequentially."
         | 
| 614 | 
            +
               ]
         | 
| 615 | 
            +
              },
         | 
| 616 | 
            +
              {
         | 
| 617 | 
            +
               "cell_type": "code",
         | 
| 618 | 
            +
               "execution_count": null,
         | 
| 619 | 
            +
               "id": "7b73ab39-ffaf-4b9e-86e5-782963c6134b",
         | 
| 620 | 
            +
               "metadata": {
         | 
| 621 | 
            +
                "id": "7b73ab39-ffaf-4b9e-86e5-782963c6134b"
         | 
| 622 | 
            +
               },
         | 
| 623 | 
            +
               "outputs": [],
         | 
| 624 | 
            +
               "source": [
         | 
| 625 | 
            +
                "common_voice = common_voice.map(prepare_dataset, remove_columns=common_voice.column_names[\"train\"], num_proc=2)"
         | 
| 626 | 
            +
               ]
         | 
| 627 | 
            +
              },
         | 
| 628 | 
            +
              {
         | 
| 629 | 
            +
               "cell_type": "markdown",
         | 
| 630 | 
            +
               "id": "54ce0fdb-7218-4a4d-b175-383980fec0df",
         | 
| 631 | 
            +
               "metadata": {},
         | 
| 632 | 
            +
               "source": [
         | 
| 633 | 
            +
                "Finally, we filter any training data with audio samples longer than 30s. These samples would otherwise be truncated by the Whisper feature-extractor which could affect the stability of training. We define a function that returns `True` for samples that are less than 30s, and `False` for those that are longer:"
         | 
| 634 | 
            +
               ]
         | 
| 635 | 
            +
              },
         | 
| 636 | 
            +
              {
         | 
| 637 | 
            +
               "cell_type": "code",
         | 
| 638 | 
            +
               "execution_count": null,
         | 
| 639 | 
            +
               "id": "01cb25ef-4bb0-4325-9461-f59198acadf6",
         | 
| 640 | 
            +
               "metadata": {},
         | 
| 641 | 
            +
               "outputs": [],
         | 
| 642 | 
            +
               "source": [
         | 
| 643 | 
            +
                "max_input_length = 30.0\n",
         | 
| 644 | 
            +
                "\n",
         | 
| 645 | 
            +
                "def is_audio_in_length_range(length):\n",
         | 
| 646 | 
            +
                "    return length < max_input_length"
         | 
| 647 | 
            +
               ]
         | 
| 648 | 
            +
              },
         | 
| 649 | 
            +
              {
         | 
| 650 | 
            +
               "cell_type": "markdown",
         | 
| 651 | 
            +
               "id": "30e676a8-7ca8-4850-8c5d-5b2b00d13fba",
         | 
| 652 | 
            +
               "metadata": {},
         | 
| 653 | 
            +
               "source": [
         | 
| 654 | 
            +
                "We apply our filter function to all samples of our training dataset through 🤗 Datasets' `.filter` method:"
         | 
| 655 | 
            +
               ]
         | 
| 656 | 
            +
              },
         | 
| 657 | 
            +
              {
         | 
| 658 | 
            +
               "cell_type": "code",
         | 
| 659 | 
            +
               "execution_count": null,
         | 
| 660 | 
            +
               "id": "333f7f6e-6053-4d3b-8924-c733c79b82ac",
         | 
| 661 | 
            +
               "metadata": {},
         | 
| 662 | 
            +
               "outputs": [],
         | 
| 663 | 
            +
               "source": [
         | 
| 664 | 
            +
                "common_voice[\"train\"] = common_voice[\"train\"].filter(\n",
         | 
| 665 | 
            +
                "    is_audio_in_length_range,\n",
         | 
| 666 | 
            +
                "    input_columns=[\"input_length\"],\n",
         | 
| 667 | 
            +
                ")"
         | 
| 668 | 
            +
               ]
         | 
| 669 | 
            +
              },
         | 
| 670 | 
            +
              {
         | 
| 671 | 
            +
               "cell_type": "markdown",
         | 
| 672 | 
            +
               "id": "263a5a58-0239-4a25-b0df-c625fc9c5810",
         | 
| 673 | 
            +
               "metadata": {
         | 
| 674 | 
            +
                "id": "263a5a58-0239-4a25-b0df-c625fc9c5810"
         | 
| 675 | 
            +
               },
         | 
| 676 | 
            +
               "source": [
         | 
| 677 | 
            +
                "## Training and Evaluation"
         | 
| 678 | 
            +
               ]
         | 
| 679 | 
            +
              },
         | 
| 680 | 
            +
              {
         | 
| 681 | 
            +
               "cell_type": "markdown",
         | 
| 682 | 
            +
               "id": "a693e768-c5a6-453f-89a1-b601dcf7daf7",
         | 
| 683 | 
            +
               "metadata": {
         | 
| 684 | 
            +
                "id": "a693e768-c5a6-453f-89a1-b601dcf7daf7"
         | 
| 685 | 
            +
               },
         | 
| 686 | 
            +
               "source": [
         | 
| 687 | 
            +
                "Now that we've prepared our data, we're ready to dive into the training pipeline. \n",
         | 
| 688 | 
            +
                "The [🤗 Trainer](https://huggingface.co/transformers/master/main_classes/trainer.html?highlight=trainer)\n",
         | 
| 689 | 
            +
                "will do much of the heavy lifting for us. All we have to do is:\n",
         | 
| 690 | 
            +
                "\n",
         | 
| 691 | 
            +
                "- Define a data collator: the data collator takes our pre-processed data and prepares PyTorch tensors ready for the model.\n",
         | 
| 692 | 
            +
                "\n",
         | 
| 693 | 
            +
                "- Evaluation metrics: during evaluation, we want to evaluate the model using the [word error rate (WER)](https://huggingface.co/metrics/wer) metric. We need to define a `compute_metrics` function that handles this computation.\n",
         | 
| 694 | 
            +
                "\n",
         | 
| 695 | 
            +
                "- Load a pre-trained checkpoint: we need to load a pre-trained checkpoint and configure it correctly for training.\n",
         | 
| 696 | 
            +
                "\n",
         | 
| 697 | 
            +
                "- Define the training configuration: this will be used by the 🤗 Trainer to define the training schedule.\n",
         | 
| 698 | 
            +
                "\n",
         | 
| 699 | 
            +
                "Once we've fine-tuned the model, we will evaluate it on the test data to verify that we have correctly trained it \n",
         | 
| 700 | 
            +
                "to transcribe speech in Hindi."
         | 
| 701 | 
            +
               ]
         | 
| 702 | 
            +
              },
         | 
| 703 | 
            +
              {
         | 
| 704 | 
            +
               "cell_type": "markdown",
         | 
| 705 | 
            +
               "id": "8d230e6d-624c-400a-bbf5-fa660881df25",
         | 
| 706 | 
            +
               "metadata": {
         | 
| 707 | 
            +
                "id": "8d230e6d-624c-400a-bbf5-fa660881df25"
         | 
| 708 | 
            +
               },
         | 
| 709 | 
            +
               "source": [
         | 
| 710 | 
            +
                "### Define a Data Collator"
         | 
| 711 | 
            +
               ]
         | 
| 712 | 
            +
              },
         | 
| 713 | 
            +
              {
         | 
| 714 | 
            +
               "cell_type": "markdown",
         | 
| 715 | 
            +
               "id": "04def221-0637-4a69-b242-d3f0c1d0ee78",
         | 
| 716 | 
            +
               "metadata": {
         | 
| 717 | 
            +
                "id": "04def221-0637-4a69-b242-d3f0c1d0ee78"
         | 
| 718 | 
            +
               },
         | 
| 719 | 
            +
               "source": [
         | 
| 720 | 
            +
                "The data collator for a sequence-to-sequence speech model is unique in the sense that it \n",
         | 
| 721 | 
            +
                "treats the `input_features` and `labels` independently: the  `input_features` must be \n",
         | 
| 722 | 
            +
                "handled by the feature extractor and the `labels` by the tokenizer.\n",
         | 
| 723 | 
            +
                "\n",
         | 
| 724 | 
            +
                "The `input_features` are already padded to 30s and converted to a log-Mel spectrogram \n",
         | 
| 725 | 
            +
                "of fixed dimension by action of the feature extractor, so all we have to do is convert the `input_features`\n",
         | 
| 726 | 
            +
                "to batched PyTorch tensors. We do this using the feature extractor's `.pad` method with `return_tensors=pt`.\n",
         | 
| 727 | 
            +
                "\n",
         | 
| 728 | 
            +
                "The `labels` on the other hand are un-padded. We first pad the sequences\n",
         | 
| 729 | 
            +
                "to the maximum length in the batch using the tokenizer's `.pad` method. The padding tokens \n",
         | 
| 730 | 
            +
                "are then replaced by `-100` so that these tokens are **not** taken into account when \n",
         | 
| 731 | 
            +
                "computing the loss. We then cut the BOS token from the start of the label sequence as we \n",
         | 
| 732 | 
            +
                "append it later during training.\n",
         | 
| 733 | 
            +
                "\n",
         | 
| 734 | 
            +
                "We can leverage the `WhisperProcessor` we defined earlier to perform both the \n",
         | 
| 735 | 
            +
                "feature extractor and the tokenizer operations:"
         | 
| 736 | 
            +
               ]
         | 
| 737 | 
            +
              },
         | 
| 738 | 
            +
              {
         | 
| 739 | 
            +
               "cell_type": "code",
         | 
| 740 | 
            +
               "execution_count": null,
         | 
| 741 | 
            +
               "id": "8326221e-ec13-4731-bb4e-51e5fc1486c5",
         | 
| 742 | 
            +
               "metadata": {
         | 
| 743 | 
            +
                "id": "8326221e-ec13-4731-bb4e-51e5fc1486c5"
         | 
| 744 | 
            +
               },
         | 
| 745 | 
            +
               "outputs": [],
         | 
| 746 | 
            +
               "source": [
         | 
| 747 | 
            +
                "import torch\n",
         | 
| 748 | 
            +
                "\n",
         | 
| 749 | 
            +
                "from dataclasses import dataclass\n",
         | 
| 750 | 
            +
                "from typing import Any, Dict, List, Union\n",
         | 
| 751 | 
            +
                "\n",
         | 
| 752 | 
            +
                "@dataclass\n",
         | 
| 753 | 
            +
                "class DataCollatorSpeechSeq2SeqWithPadding:\n",
         | 
| 754 | 
            +
                "    processor: Any\n",
         | 
| 755 | 
            +
                "\n",
         | 
| 756 | 
            +
                "    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:\n",
         | 
| 757 | 
            +
                "        # split inputs and labels since they have to be of different lengths and need different padding methods\n",
         | 
| 758 | 
            +
                "        # first treat the audio inputs by simply returning torch tensors\n",
         | 
| 759 | 
            +
                "        input_features = [{\"input_features\": feature[\"input_features\"]} for feature in features]\n",
         | 
| 760 | 
            +
                "        batch = self.processor.feature_extractor.pad(input_features, return_tensors=\"pt\")\n",
         | 
| 761 | 
            +
                "\n",
         | 
| 762 | 
            +
                "        # get the tokenized label sequences\n",
         | 
| 763 | 
            +
                "        label_features = [{\"input_ids\": feature[\"labels\"]} for feature in features]\n",
         | 
| 764 | 
            +
                "        # pad the labels to max length\n",
         | 
| 765 | 
            +
                "        labels_batch = self.processor.tokenizer.pad(label_features, return_tensors=\"pt\")\n",
         | 
| 766 | 
            +
                "\n",
         | 
| 767 | 
            +
                "        # replace padding with -100 to ignore loss correctly\n",
         | 
| 768 | 
            +
                "        labels = labels_batch[\"input_ids\"].masked_fill(labels_batch.attention_mask.ne(1), -100)\n",
         | 
| 769 | 
            +
                "\n",
         | 
| 770 | 
            +
                "        # if bos token is appended in previous tokenization step,\n",
         | 
| 771 | 
            +
                "        # cut bos token here as it's append later anyways\n",
         | 
| 772 | 
            +
                "        if (labels[:, 0] == self.processor.tokenizer.bos_token_id).all().cpu().item():\n",
         | 
| 773 | 
            +
                "            labels = labels[:, 1:]\n",
         | 
| 774 | 
            +
                "\n",
         | 
| 775 | 
            +
                "        batch[\"labels\"] = labels\n",
         | 
| 776 | 
            +
                "\n",
         | 
| 777 | 
            +
                "        return batch"
         | 
| 778 | 
            +
               ]
         | 
| 779 | 
            +
              },
         | 
| 780 | 
            +
              {
         | 
| 781 | 
            +
               "cell_type": "markdown",
         | 
| 782 | 
            +
               "id": "3cae7dbf-8a50-456e-a3a8-7fd005390f86",
         | 
| 783 | 
            +
               "metadata": {
         | 
| 784 | 
            +
                "id": "3cae7dbf-8a50-456e-a3a8-7fd005390f86"
         | 
| 785 | 
            +
               },
         | 
| 786 | 
            +
               "source": [
         | 
| 787 | 
            +
                "Let's initialise the data collator we've just defined:"
         | 
| 788 | 
            +
               ]
         | 
| 789 | 
            +
              },
         | 
| 790 | 
            +
              {
         | 
| 791 | 
            +
               "cell_type": "code",
         | 
| 792 | 
            +
               "execution_count": null,
         | 
| 793 | 
            +
               "id": "fc834702-c0d3-4a96-b101-7b87be32bf42",
         | 
| 794 | 
            +
               "metadata": {
         | 
| 795 | 
            +
                "id": "fc834702-c0d3-4a96-b101-7b87be32bf42"
         | 
| 796 | 
            +
               },
         | 
| 797 | 
            +
               "outputs": [],
         | 
| 798 | 
            +
               "source": [
         | 
| 799 | 
            +
                "data_collator = DataCollatorSpeechSeq2SeqWithPadding(processor=processor)"
         | 
| 800 | 
            +
               ]
         | 
| 801 | 
            +
              },
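            +
               {
            +
                "cell_type": "code",
            +
                "execution_count": null,
            +
                "metadata": {},
            +
                "outputs": [],
            +
                "source": [
            +
                 "# optional sanity check, assuming the dataset has been prepared above:\n",
            +
                 "# collate two prepared samples and inspect the padded tensor shapes\n",
            +
                 "batch = data_collator([common_voice[\"train\"][i] for i in range(2)])\n",
            +
                 "print(batch[\"input_features\"].shape)  # (2, 80, 3000): batch, mel bins, frames\n",
            +
                 "print(batch[\"labels\"].shape)  # (2, longest label sequence in the batch)"
            +
                ]
            +
               },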
         | 
| 802 | 
            +
              {
         | 
| 803 | 
            +
               "cell_type": "markdown",
         | 
| 804 | 
            +
               "id": "d62bb2ab-750a-45e7-82e9-61d6f4805698",
         | 
| 805 | 
            +
               "metadata": {
         | 
| 806 | 
            +
                "id": "d62bb2ab-750a-45e7-82e9-61d6f4805698"
         | 
| 807 | 
            +
               },
         | 
| 808 | 
            +
               "source": [
         | 
| 809 | 
            +
                "### Evaluation Metrics"
         | 
| 810 | 
            +
               ]
         | 
| 811 | 
            +
              },
         | 
| 812 | 
            +
              {
         | 
| 813 | 
            +
               "cell_type": "markdown",
         | 
| 814 | 
            +
               "id": "66fee1a7-a44c-461e-b047-c3917221572e",
         | 
| 815 | 
            +
               "metadata": {
         | 
| 816 | 
            +
                "id": "66fee1a7-a44c-461e-b047-c3917221572e"
         | 
| 817 | 
            +
               },
         | 
| 818 | 
            +
               "source": [
         | 
| 819 | 
            +
                "We'll use the word error rate (WER) metric, the 'de-facto' metric for assessing \n",
         | 
| 820 | 
            +
                "ASR systems. For more information, refer to the WER [docs](https://huggingface.co/metrics/wer). We'll load the WER metric from 🤗 Evaluate:"
         | 
| 821 | 
            +
               ]
         | 
| 822 | 
            +
              },
         | 
| 823 | 
            +
              {
         | 
| 824 | 
            +
               "cell_type": "code",
         | 
| 825 | 
            +
               "execution_count": null,
         | 
| 826 | 
            +
               "id": "b22b4011-f31f-4b57-b684-c52332f92890",
         | 
| 827 | 
            +
               "metadata": {
         | 
| 828 | 
            +
                "id": "b22b4011-f31f-4b57-b684-c52332f92890"
         | 
| 829 | 
            +
               },
         | 
| 830 | 
            +
               "outputs": [],
         | 
| 831 | 
            +
               "source": [
         | 
| 832 | 
            +
                "import evaluate\n",
         | 
| 833 | 
            +
                "\n",
         | 
| 834 | 
            +
                "metric = evaluate.load(\"wer\")"
         | 
| 835 | 
            +
               ]
         | 
| 836 | 
            +
              },
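            +
               {
            +
                "cell_type": "code",
            +
                "execution_count": null,
            +
                "metadata": {},
            +
                "outputs": [],
            +
                "source": [
            +
                 "# toy sanity check of the metric (strings are illustrative, not from the dataset):\n",
            +
                 "# one substituted word out of four reference words gives a WER of 25%\n",
            +
                 "print(100 * metric.compute(predictions=[\"the cat sat down\"], references=[\"the cat sat up\"]))"
            +
                ]
            +
               },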
         | 
| 837 | 
            +
              {
         | 
| 838 | 
            +
               "cell_type": "markdown",
         | 
| 839 | 
            +
               "id": "4f32cab6-31f0-4cb9-af4c-40ba0f5fc508",
         | 
| 840 | 
            +
               "metadata": {
         | 
| 841 | 
            +
                "id": "4f32cab6-31f0-4cb9-af4c-40ba0f5fc508"
         | 
| 842 | 
            +
               },
         | 
| 843 | 
            +
               "source": [
         | 
| 844 | 
            +
                "We then simply have to define a function that takes our model \n",
         | 
| 845 | 
            +
                "predictions and returns the WER metric. This function, called\n",
         | 
| 846 | 
            +
                "`compute_metrics`, first replaces `-100` with the `pad_token_id`\n",
         | 
| 847 | 
            +
                "in the `label_ids` (undoing the step we applied in the \n",
         | 
| 848 | 
            +
                "data collator to ignore padded tokens correctly in the loss).\n",
         | 
| 849 | 
            +
                "It then decodes the predicted and label ids to strings. Finally,\n",
         | 
| 850 | 
            +
                "it computes the WER between the predictions and reference labels. \n",
         | 
| 851 | 
            +
                "Here, we have the option of evaluating with the 'normalised' transcriptions \n",
         | 
| 852 | 
            +
                "and predictions. We recommend you set this to `True` to benefit from the WER \n",
         | 
| 853 | 
            +
                "improvement obtained by normalising the transcriptions."
         | 
| 854 | 
            +
               ]
         | 
| 855 | 
            +
              },
         | 
| 856 | 
            +
              {
         | 
| 857 | 
            +
               "cell_type": "code",
         | 
| 858 | 
            +
               "execution_count": null,
         | 
| 859 | 
            +
               "id": "23959a70-22d0-4ffe-9fa1-72b61e75bb52",
         | 
| 860 | 
            +
               "metadata": {
         | 
| 861 | 
            +
                "id": "23959a70-22d0-4ffe-9fa1-72b61e75bb52"
         | 
| 862 | 
            +
               },
         | 
| 863 | 
            +
               "outputs": [],
         | 
| 864 | 
            +
               "source": [
         | 
| 865 | 
            +
                "# evaluate with the 'normalised' WER\n",
         | 
| 866 | 
            +
                "do_normalize_eval = True\n",
         | 
| 867 | 
            +
                "\n",
         | 
| 868 | 
            +
                "def compute_metrics(pred):\n",
         | 
| 869 | 
            +
                "    pred_ids = pred.predictions\n",
         | 
| 870 | 
            +
                "    label_ids = pred.label_ids\n",
         | 
| 871 | 
            +
                "\n",
         | 
| 872 | 
            +
                "    # replace -100 with the pad_token_id\n",
         | 
| 873 | 
            +
                "    label_ids[label_ids == -100] = processor.tokenizer.pad_token_id\n",
         | 
| 874 | 
            +
                "\n",
         | 
| 875 | 
            +
                "    # we do not want to group tokens when computing the metrics\n",
         | 
| 876 | 
            +
                "    pred_str = processor.tokenizer.batch_decode(pred_ids, skip_special_tokens=True)\n",
         | 
| 877 | 
            +
                "    label_str = processor.tokenizer.batch_decode(label_ids, skip_special_tokens=True)\n",
         | 
| 878 | 
            +
                "\n",
         | 
| 879 | 
            +
                "    if do_normalize_eval:\n",
         | 
| 880 | 
            +
                "        pred_str = [normalizer(pred) for pred in pred_str]\n",
         | 
| 881 | 
            +
                "        label_str = [normalizer(label) for label in label_str]\n",
         | 
| 882 | 
            +
                "\n",
         | 
| 883 | 
            +
                "    wer = 100 * metric.compute(predictions=pred_str, references=label_str)\n",
         | 
| 884 | 
            +
                "\n",
         | 
| 885 | 
            +
                "    return {\"wer\": wer}"
         | 
| 886 | 
            +
               ]
         | 
| 887 | 
            +
              },
         | 
| 888 | 
            +
              {
         | 
| 889 | 
            +
               "cell_type": "markdown",
         | 
| 890 | 
            +
               "id": "daf2a825-6d9f-4a23-b145-c37c0039075b",
         | 
| 891 | 
            +
               "metadata": {
         | 
| 892 | 
            +
                "id": "daf2a825-6d9f-4a23-b145-c37c0039075b"
         | 
| 893 | 
            +
               },
         | 
| 894 | 
            +
               "source": [
         | 
| 895 | 
            +
                "### Load a Pre-Trained Checkpoint"
         | 
| 896 | 
            +
               ]
         | 
| 897 | 
            +
              },
         | 
| 898 | 
            +
              {
         | 
| 899 | 
            +
               "cell_type": "markdown",
         | 
| 900 | 
            +
               "id": "437a97fa-4864-476b-8abc-f28b8166cfa5",
         | 
| 901 | 
            +
               "metadata": {
         | 
| 902 | 
            +
                "id": "437a97fa-4864-476b-8abc-f28b8166cfa5"
         | 
| 903 | 
            +
               },
         | 
| 904 | 
            +
               "source": [
         | 
| 905 | 
            +
                "Now let's load the pre-trained Whisper `small` checkpoint. Again, this \n",
         | 
| 906 | 
            +
                "is trivial through use of 🤗 Transformers!"
         | 
| 907 | 
            +
               ]
         | 
| 908 | 
            +
              },
         | 
| 909 | 
            +
              {
         | 
| 910 | 
            +
               "cell_type": "code",
         | 
| 911 | 
            +
               "execution_count": null,
         | 
| 912 | 
            +
               "id": "5a10cc4b-07ec-4ebd-ac1d-7c601023594f",
         | 
| 913 | 
            +
               "metadata": {
         | 
| 914 | 
            +
                "id": "5a10cc4b-07ec-4ebd-ac1d-7c601023594f"
         | 
| 915 | 
            +
               },
         | 
| 916 | 
            +
               "outputs": [],
         | 
| 917 | 
            +
               "source": [
         | 
| 918 | 
            +
                "from transformers import WhisperForConditionalGeneration\n",
         | 
| 919 | 
            +
                "\n",
         | 
| 920 | 
            +
                "model = WhisperForConditionalGeneration.from_pretrained(\"openai/whisper-small\")"
         | 
| 921 | 
            +
               ]
         | 
| 922 | 
            +
              },
         | 
| 923 | 
            +
              {
         | 
| 924 | 
            +
               "cell_type": "markdown",
         | 
| 925 | 
            +
               "id": "a15ead5f-2277-4a39-937b-585c2497b2df",
         | 
| 926 | 
            +
               "metadata": {
         | 
| 927 | 
            +
                "id": "a15ead5f-2277-4a39-937b-585c2497b2df"
         | 
| 928 | 
            +
               },
         | 
| 929 | 
            +
               "source": [
         | 
| 930 | 
            +
                "Override generation arguments - no tokens are forced as decoder outputs (see [`forced_decoder_ids`](https://huggingface.co/docs/transformers/main_classes/text_generation#transformers.generation_utils.GenerationMixin.generate.forced_decoder_ids)), no tokens are suppressed during generation (see [`suppress_tokens`](https://huggingface.co/docs/transformers/main_classes/text_generation#transformers.generation_utils.GenerationMixin.generate.suppress_tokens)). Set `use_cache` to False since we're using gradient checkpointing, and the two are incompatible:"
         | 
| 931 | 
            +
               ]
         | 
| 932 | 
            +
              },
         | 
| 933 | 
            +
              {
         | 
| 934 | 
            +
               "cell_type": "code",
         | 
| 935 | 
            +
               "execution_count": null,
         | 
| 936 | 
            +
               "id": "62038ba3-88ed-4fce-84db-338f50dcd04f",
         | 
| 937 | 
            +
               "metadata": {
         | 
| 938 | 
            +
                "id": "62038ba3-88ed-4fce-84db-338f50dcd04f"
         | 
| 939 | 
            +
               },
         | 
| 940 | 
            +
               "outputs": [],
         | 
| 941 | 
            +
               "source": [
         | 
| 942 | 
            +
                "model.config.forced_decoder_ids = None\n",
         | 
| 943 | 
            +
                "model.config.suppress_tokens = []\n",
         | 
| 944 | 
            +
                "model.config.use_cache = False"
         | 
| 945 | 
            +
               ]
         | 
| 946 | 
            +
              },
         | 
| 947 | 
            +
              {
         | 
| 948 | 
            +
               "cell_type": "markdown",
         | 
| 949 | 
            +
               "id": "2178dea4-80ca-47b6-b6ea-ba1915c90c06",
         | 
| 950 | 
            +
               "metadata": {
         | 
| 951 | 
            +
                "id": "2178dea4-80ca-47b6-b6ea-ba1915c90c06"
         | 
| 952 | 
            +
               },
         | 
| 953 | 
            +
               "source": [
         | 
| 954 | 
            +
                "### Define the Training Configuration"
         | 
| 955 | 
            +
               ]
         | 
| 956 | 
            +
              },
         | 
| 957 | 
            +
              {
         | 
| 958 | 
            +
               "cell_type": "markdown",
         | 
| 959 | 
            +
               "id": "c21af1e9-0188-4134-ac82-defc7bdcc436",
         | 
| 960 | 
            +
               "metadata": {
         | 
| 961 | 
            +
                "id": "c21af1e9-0188-4134-ac82-defc7bdcc436"
         | 
| 962 | 
            +
               },
         | 
| 963 | 
            +
               "source": [
         | 
| 964 | 
            +
                "In the final step, we define all the parameters related to training. For more detail on the training arguments, refer to the Seq2SeqTrainingArguments [docs](https://huggingface.co/docs/transformers/main_classes/trainer#transformers.Seq2SeqTrainingArguments)."
         | 
| 965 | 
            +
               ]
         | 
| 966 | 
            +
              },
         | 
| 967 | 
            +
              {
         | 
| 968 | 
            +
               "cell_type": "code",
         | 
| 969 | 
            +
               "execution_count": null,
         | 
| 970 | 
            +
               "id": "0ae3e9af-97b7-4aa0-ae85-20b23b5bcb3a",
         | 
| 971 | 
            +
               "metadata": {
         | 
| 972 | 
            +
                "id": "0ae3e9af-97b7-4aa0-ae85-20b23b5bcb3a"
         | 
| 973 | 
            +
               },
         | 
| 974 | 
            +
               "outputs": [],
         | 
| 975 | 
            +
               "source": [
         | 
| 976 | 
            +
                "from transformers import Seq2SeqTrainingArguments\n",
         | 
| 977 | 
            +
                "\n",
         | 
| 978 | 
            +
                "training_args = Seq2SeqTrainingArguments(\n",
         | 
| 979 | 
            +
                "    output_dir=\"./\",\n",
         | 
| 980 | 
            +
                "    per_device_train_batch_size=64,\n",
         | 
| 981 | 
            +
                "    gradient_accumulation_steps=1,  # increase by 2x for every 2x decrease in batch size\n",
         | 
| 982 | 
            +
                "    learning_rate=1e-5,\n",
         | 
| 983 | 
            +
                "    warmup_steps=500,\n",
         | 
| 984 | 
            +
                "    max_steps=5000,\n",
         | 
| 985 | 
            +
                "    gradient_checkpointing=True,\n",
         | 
| 986 | 
            +
                "    fp16=True,\n",
         | 
| 987 | 
            +
                "    evaluation_strategy=\"steps\",\n",
         | 
| 988 | 
            +
                "    per_device_eval_batch_size=8,\n",
         | 
| 989 | 
            +
                "    predict_with_generate=True,\n",
         | 
| 990 | 
            +
                "    generation_max_length=225,\n",
         | 
| 991 | 
            +
                "    save_steps=1000,\n",
         | 
| 992 | 
            +
                "    eval_steps=1000,\n",
         | 
| 993 | 
            +
                "    logging_steps=25,\n",
         | 
| 994 | 
            +
                "    report_to=[\"tensorboard\"],\n",
         | 
| 995 | 
            +
                "    load_best_model_at_end=True,\n",
         | 
| 996 | 
            +
                "    metric_for_best_model=\"wer\",\n",
         | 
| 997 | 
            +
                "    greater_is_better=False,\n",
         | 
| 998 | 
            +
                "    push_to_hub=True,\n",
         | 
| 999 | 
            +
                ")"
         | 
| 1000 | 
            +
               ]
         | 
| 1001 | 
            +
              },
         | 
| 1002 | 
            +
              {
         | 
| 1003 | 
            +
               "cell_type": "markdown",
         | 
| 1004 | 
            +
               "id": "b3a944d8-3112-4552-82a0-be25988b3857",
         | 
| 1005 | 
            +
               "metadata": {
         | 
| 1006 | 
            +
                "id": "b3a944d8-3112-4552-82a0-be25988b3857"
         | 
| 1007 | 
            +
               },
         | 
| 1008 | 
            +
               "source": [
         | 
| 1009 | 
            +
                "**Note**: if one does not want to upload the model checkpoints to the Hub, \n",
         | 
| 1010 | 
            +
                "set `push_to_hub=False`."
         | 
| 1011 | 
            +
               ]
         | 
| 1012 | 
            +
              },
         | 
| 1013 | 
            +
              {
         | 
| 1014 | 
            +
               "cell_type": "markdown",
         | 
| 1015 | 
            +
               "id": "bac29114-d226-4f54-97cf-8718c9f94e1e",
         | 
| 1016 | 
            +
               "metadata": {
         | 
| 1017 | 
            +
                "id": "bac29114-d226-4f54-97cf-8718c9f94e1e"
         | 
| 1018 | 
            +
               },
         | 
| 1019 | 
            +
               "source": [
         | 
| 1020 | 
            +
                "We can forward the training arguments to the 🤗 Trainer along with our model,\n",
         | 
| 1021 | 
            +
                "dataset, data collator and `compute_metrics` function:"
         | 
| 1022 | 
            +
               ]
         | 
| 1023 | 
            +
              },
         | 
| 1024 | 
            +
              {
         | 
| 1025 | 
            +
               "cell_type": "code",
         | 
| 1026 | 
            +
               "execution_count": null,
         | 
| 1027 | 
            +
               "id": "d546d7fe-0543-479a-b708-2ebabec19493",
         | 
| 1028 | 
            +
               "metadata": {
         | 
| 1029 | 
            +
                "id": "d546d7fe-0543-479a-b708-2ebabec19493"
         | 
| 1030 | 
            +
               },
         | 
| 1031 | 
            +
               "outputs": [],
         | 
| 1032 | 
            +
               "source": [
         | 
| 1033 | 
            +
                "from transformers import Seq2SeqTrainer\n",
         | 
| 1034 | 
            +
                "\n",
         | 
| 1035 | 
            +
                "trainer = Seq2SeqTrainer(\n",
         | 
| 1036 | 
            +
                "    args=training_args,\n",
         | 
| 1037 | 
            +
                "    model=model,\n",
         | 
| 1038 | 
            +
                "    train_dataset=common_voice[\"train\"],\n",
         | 
| 1039 | 
            +
                "    eval_dataset=common_voice[\"test\"],\n",
         | 
| 1040 | 
            +
                "    data_collator=data_collator,\n",
         | 
| 1041 | 
            +
                "    compute_metrics=compute_metrics,\n",
         | 
| 1042 | 
            +
                "    tokenizer=processor.feature_extractor,\n",
         | 
| 1043 | 
            +
                ")"
         | 
| 1044 | 
            +
               ]
         | 
| 1045 | 
            +
              },
         | 
| 1046 | 
            +
              {
         | 
| 1047 | 
            +
               "cell_type": "markdown",
         | 
| 1048 | 
            +
               "id": "uOrRhDGtN5S4",
         | 
| 1049 | 
            +
               "metadata": {
         | 
| 1050 | 
            +
                "id": "uOrRhDGtN5S4"
         | 
| 1051 | 
            +
               },
         | 
| 1052 | 
            +
               "source": [
         | 
| 1053 | 
            +
                "We'll save the processor object once before starting training. Since the processor is not trainable, it won't change over the course of training:"
         | 
| 1054 | 
            +
               ]
         | 
| 1055 | 
            +
              },
         | 
| 1056 | 
            +
              {
         | 
| 1057 | 
            +
               "cell_type": "code",
         | 
| 1058 | 
            +
               "execution_count": null,
         | 
| 1059 | 
            +
               "id": "-2zQwMfEOBJq",
         | 
| 1060 | 
            +
               "metadata": {
         | 
| 1061 | 
            +
                "id": "-2zQwMfEOBJq"
         | 
| 1062 | 
            +
               },
         | 
| 1063 | 
            +
               "outputs": [],
         | 
| 1064 | 
            +
               "source": [
         | 
| 1065 | 
            +
                "processor.save_pretrained(training_args.output_dir)"
         | 
| 1066 | 
            +
               ]
         | 
| 1067 | 
            +
              },
         | 
| 1068 | 
            +
              {
         | 
| 1069 | 
            +
               "cell_type": "markdown",
         | 
| 1070 | 
            +
               "id": "7f404cf9-4345-468c-8196-4bd101d9bd51",
         | 
| 1071 | 
            +
               "metadata": {
         | 
| 1072 | 
            +
                "id": "7f404cf9-4345-468c-8196-4bd101d9bd51"
         | 
| 1073 | 
            +
               },
         | 
| 1074 | 
            +
               "source": [
         | 
| 1075 | 
            +
                "### Training"
         | 
| 1076 | 
            +
               ]
         | 
| 1077 | 
            +
              },
         | 
| 1078 | 
            +
              {
         | 
| 1079 | 
            +
               "cell_type": "markdown",
         | 
| 1080 | 
            +
               "id": "5e8b8d56-5a70-4f68-bd2e-f0752d0bd112",
         | 
| 1081 | 
            +
               "metadata": {
         | 
| 1082 | 
            +
                "id": "5e8b8d56-5a70-4f68-bd2e-f0752d0bd112"
         | 
| 1083 | 
            +
               },
         | 
| 1084 | 
            +
               "source": [
         | 
| 1085 | 
            +
                "Training will take approximately 5-10 hours depending on your GPU. The peak GPU memory for the given training configuration is approximately 36GB. \n",
         | 
| 1086 | 
            +
                "Depending on your GPU, it is possible that you will encounter a CUDA `\"out-of-memory\"` error when you launch training. \n",
         | 
| 1087 | 
            +
                "In this case, you can reduce the `per_device_train_batch_size` incrementally by factors of 2 \n",
         | 
| 1088 | 
            +
                "and employ [`gradient_accumulation_steps`](https://huggingface.co/docs/transformers/main_classes/trainer#transformers.Seq2SeqTrainingArguments.gradient_accumulation_steps)\n",
         | 
| 1089 | 
            +
                "to compensate.\n",
         | 
| 1090 | 
            +
                "\n",
         | 
| 1091 | 
            +
                "To launch training, simply execute:"
         | 
| 1092 | 
            +
               ]
         | 
| 1093 | 
            +
              },
         | 
| 1094 | 
            +
              {
         | 
| 1095 | 
            +
               "cell_type": "code",
         | 
| 1096 | 
            +
               "execution_count": null,
         | 
| 1097 | 
            +
               "id": "ee8b7b8e-1c9a-4d77-9137-1778a629e6de",
         | 
| 1098 | 
            +
               "metadata": {
         | 
| 1099 | 
            +
                "id": "ee8b7b8e-1c9a-4d77-9137-1778a629e6de"
         | 
| 1100 | 
            +
               },
         | 
| 1101 | 
            +
               "outputs": [],
         | 
| 1102 | 
            +
               "source": [
         | 
| 1103 | 
            +
                "trainer.train()"
         | 
| 1104 | 
            +
               ]
         | 
| 1105 | 
            +
              },
         | 
| 1106 | 
            +
              {
         | 
| 1107 | 
            +
               "cell_type": "markdown",
         | 
| 1108 | 
            +
               "id": "810ced54-7187-4a06-b2fe-ba6dcca94dc3",
         | 
| 1109 | 
            +
               "metadata": {
         | 
| 1110 | 
            +
                "id": "810ced54-7187-4a06-b2fe-ba6dcca94dc3"
         | 
| 1111 | 
            +
               },
         | 
| 1112 | 
            +
               "source": [
         | 
| 1113 | 
            +
                "We can label our checkpoint with the `whisper-event` tag on push by setting the appropriate key-word arguments (kwargs):"
         | 
| 1114 | 
            +
               ]
         | 
| 1115 | 
            +
              },
         | 
| 1116 | 
            +
              {
         | 
| 1117 | 
            +
               "cell_type": "code",
         | 
| 1118 | 
            +
               "execution_count": null,
         | 
| 1119 | 
            +
               "id": "c704f91e-241b-48c9-b8e0-f0da396a9663",
         | 
| 1120 | 
            +
               "metadata": {
         | 
| 1121 | 
            +
                "id": "c704f91e-241b-48c9-b8e0-f0da396a9663"
         | 
| 1122 | 
            +
               },
         | 
| 1123 | 
            +
               "outputs": [],
         | 
| 1124 | 
            +
               "source": [
         | 
| 1125 | 
            +
                "kwargs = {\n",
         | 
| 1126 | 
            +
                "    \"dataset_tags\": \"mozilla-foundation/common_voice_11_0\",\n",
         | 
| 1127 | 
            +
                "    \"dataset\": \"Common Voice 11.0\",  # a 'pretty' name for the training dataset\n",
         | 
| 1128 | 
            +
                "    \"language\": \"hi\",\n",
         | 
| 1129 | 
            +
                "    \"model_name\": \"Whisper Small Hi - Sanchit Gandhi\",  # a 'pretty' name for your model\n",
         | 
| 1130 | 
            +
                "    \"finetuned_from\": \"openai/whisper-small\",\n",
         | 
| 1131 | 
            +
                "    \"tasks\": \"automatic-speech-recognition\",\n",
         | 
| 1132 | 
            +
                "    \"tags\": \"whisper-event\",\n",
         | 
| 1133 | 
            +
                "}"
         | 
| 1134 | 
            +
               ]
         | 
| 1135 | 
            +
              },
         | 
| 1136 | 
            +
              {
         | 
| 1137 | 
            +
               "cell_type": "markdown",
         | 
| 1138 | 
            +
               "id": "090d676a-f944-4297-a938-a40eda0b2b68",
         | 
| 1139 | 
            +
               "metadata": {
         | 
| 1140 | 
            +
                "id": "090d676a-f944-4297-a938-a40eda0b2b68"
         | 
| 1141 | 
            +
               },
         | 
| 1142 | 
            +
               "source": [
         | 
| 1143 | 
            +
                "The training results can now be uploaded to the Hub. To do so, execute the `push_to_hub` command and save the preprocessor object we created:"
         | 
| 1144 | 
            +
               ]
         | 
| 1145 | 
            +
              },
         | 
| 1146 | 
            +
              {
         | 
| 1147 | 
            +
               "cell_type": "code",
         | 
| 1148 | 
            +
               "execution_count": null,
         | 
| 1149 | 
            +
               "id": "d7030622-caf7-4039-939b-6195cdaa2585",
         | 
| 1150 | 
            +
               "metadata": {
         | 
| 1151 | 
            +
                "id": "d7030622-caf7-4039-939b-6195cdaa2585"
         | 
| 1152 | 
            +
               },
         | 
| 1153 | 
            +
               "outputs": [],
         | 
| 1154 | 
            +
               "source": [
         | 
| 1155 | 
            +
                "trainer.push_to_hub(**kwargs)"
         | 
| 1156 | 
            +
               ]
         | 
| 1157 | 
            +
              },
         | 
| 1158 | 
            +
              {
         | 
| 1159 | 
            +
               "cell_type": "markdown",
         | 
| 1160 | 
            +
               "id": "ca743fbd-602c-48d4-ba8d-a2fe60af64ba",
         | 
| 1161 | 
            +
               "metadata": {
         | 
| 1162 | 
            +
                "id": "ca743fbd-602c-48d4-ba8d-a2fe60af64ba"
         | 
| 1163 | 
            +
               },
         | 
| 1164 | 
            +
               "source": [
         | 
| 1165 | 
            +
                "## Closing Remarks"
         | 
| 1166 | 
            +
               ]
         | 
| 1167 | 
            +
              },
         | 
| 1168 | 
            +
              {
         | 
| 1169 | 
            +
               "cell_type": "markdown",
         | 
| 1170 | 
            +
               "id": "7f737783-2870-4e35-aa11-86a42d7d997a",
         | 
| 1171 | 
            +
               "metadata": {
         | 
| 1172 | 
            +
                "id": "7f737783-2870-4e35-aa11-86a42d7d997a"
         | 
| 1173 | 
            +
               },
         | 
| 1174 | 
            +
               "source": [
         | 
| 1175 | 
            +
                "In this blog, we covered a step-by-step guide on fine-tuning Whisper for multilingual ASR \n",
         | 
| 1176 | 
            +
                "using 🤗 Datasets, Transformers and the Hugging Face Hub. For more details on the Whisper model, the Common Voice dataset and the theory behind fine-tuning, refere to the accompanying [blog post](https://huggingface.co/blog/fine-tune-whisper). If you're interested in fine-tuning other \n",
         | 
| 1177 | 
            +
                "Transformers models, both for English and multilingual ASR, be sure to check out the \n",
         | 
| 1178 | 
            +
                "examples scripts at [examples/pytorch/speech-recognition](https://github.com/huggingface/transformers/tree/main/examples/pytorch/speech-recognition)."
         | 
| 1179 | 
            +
               ]
         | 
| 1180 | 
            +
              }
         | 
| 1181 | 
            +
             ],
         | 
| 1182 | 
            +
             "metadata": {
         | 
| 1183 | 
            +
              "colab": {
         | 
| 1184 | 
            +
               "include_colab_link": true,
         | 
| 1185 | 
            +
               "provenance": []
         | 
| 1186 | 
            +
              },
         | 
| 1187 | 
            +
              "kernelspec": {
         | 
| 1188 | 
            +
               "display_name": "Python 3 (ipykernel)",
         | 
| 1189 | 
            +
               "language": "python",
         | 
| 1190 | 
            +
               "name": "python3"
         | 
| 1191 | 
            +
              },
         | 
| 1192 | 
            +
              "language_info": {
         | 
| 1193 | 
            +
               "codemirror_mode": {
         | 
| 1194 | 
            +
                "name": "ipython",
         | 
| 1195 | 
            +
                "version": 3
         | 
| 1196 | 
            +
               },
         | 
| 1197 | 
            +
               "file_extension": ".py",
         | 
| 1198 | 
            +
               "mimetype": "text/x-python",
         | 
| 1199 | 
            +
               "name": "python",
         | 
| 1200 | 
            +
               "nbconvert_exporter": "python",
         | 
| 1201 | 
            +
               "pygments_lexer": "ipython3",
         | 
| 1202 | 
            +
               "version": "3.8.9"
         | 
| 1203 | 
            +
              }
         | 
| 1204 | 
            +
             },
         | 
| 1205 | 
            +
             "nbformat": 4,
         | 
| 1206 | 
            +
             "nbformat_minor": 5
         | 
| 1207 | 
            +
            }
         | 
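
The notebook above advises halving `per_device_train_batch_size` and employing gradient accumulation when training hits a CUDA out-of-memory error. A minimal sketch of that trade-off, assuming an original batch size of 16 (the other arguments are illustrative placeholders, not the notebook's full configuration):

    # Halve the per-device batch size and double the accumulation steps so the
    # effective batch size (per_device_train_batch_size * gradient_accumulation_steps)
    # stays at 16 while peak GPU memory drops.
    from transformers import Seq2SeqTrainingArguments

    training_args = Seq2SeqTrainingArguments(
        output_dir="./whisper-small-hi",   # hypothetical output directory
        per_device_train_batch_size=8,     # halved from an assumed 16
        gradient_accumulation_steps=2,     # doubled to compensate: 8 * 2 = 16
        learning_rate=1e-5,
        max_steps=4000,
        fp16=True,
    )
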
    	
        merges.txt
    ADDED
    
    | The diff for this file is too large to render. 
		See raw diff | 
|  | 
    	
        mgb2_speech.py
    ADDED
    
    | @@ -0,0 +1,152 @@ | |
| 1 | 
            +
            import datasets
         | 
| 2 | 
            +
            import os
         | 
| 3 | 
            +
             | 
| 4 | 
            +
             | 
| 5 | 
            +
            _DESCRIPTION = "MGB2 speech recognition dataset AR"
         | 
| 6 | 
            +
            _HOMEPAGE = "https://arabicspeech.org/mgb2/"
         | 
| 7 | 
            +
            _LICENSE = "MGB-2 License agreement"
         | 
| 8 | 
            +
            _CITATION = """@misc{https://doi.org/10.48550/arxiv.1609.05625,
         | 
| 9 | 
            +
              doi = {10.48550/ARXIV.1609.05625},
         | 
| 10 | 
            +
              
         | 
| 11 | 
            +
              url = {https://arxiv.org/abs/1609.05625},
         | 
| 12 | 
            +
              
         | 
| 13 | 
            +
              author = {Ali, Ahmed and Bell, Peter and Glass, James and Messaoui, Yacine and Mubarak, Hamdy and Renals, Steve and Zhang, Yifan},
         | 
| 14 | 
            +
              
         | 
| 15 | 
            +
              keywords = {Computation and Language (cs.CL), FOS: Computer and information sciences, FOS: Computer and information sciences},
         | 
| 16 | 
            +
              
         | 
| 17 | 
            +
              title = {The MGB-2 Challenge: Arabic Multi-Dialect Broadcast Media Recognition},
         | 
| 18 | 
            +
              
         | 
| 19 | 
            +
              publisher = {arXiv},
         | 
| 20 | 
            +
              
         | 
| 21 | 
            +
              year = {2016},
         | 
| 22 | 
            +
              
         | 
| 23 | 
            +
              copyright = {arXiv.org perpetual, non-exclusive license}
         | 
| 24 | 
            +
            }
         | 
| 25 | 
            +
            """
         | 
| 26 | 
            +
            _DATA_ARCHIVE_ROOT = "archives/"
         | 
| 27 | 
            +
            _DATA_URL = {
         | 
| 28 | 
            +
                "test": _DATA_ARCHIVE_ROOT + "mgb2_wav.test.tar.gz",
         | 
| 29 | 
            +
                "dev": _DATA_ARCHIVE_ROOT + "mgb2_wav.dev.tar.gz",
         | 
| 30 | 
            +
                "train": [_DATA_ARCHIVE_ROOT + f"mgb2_wav_{x}.train.tar.gz" for x in range(48)], # we have 48 archives
         | 
| 31 | 
            +
            }
         | 
| 32 | 
            +
            _TEXT_URL = {
         | 
| 33 | 
            +
                "test": _DATA_ARCHIVE_ROOT + "mgb2_txt.test.tar.gz",
         | 
| 34 | 
            +
                "dev": _DATA_ARCHIVE_ROOT + "mgb2_txt.dev.tar.gz",
         | 
| 35 | 
            +
                "train": _DATA_ARCHIVE_ROOT + "mgb2_txt.train.tar.gz",
         | 
| 36 | 
            +
            }
         | 
| 37 | 
            +
             | 
| 38 | 
            +
            class MGB2Dataset(datasets.GeneratorBasedBuilder):
         | 
| 39 | 
            +
                def _info(self):
         | 
| 40 | 
            +
                    return datasets.DatasetInfo(
         | 
| 41 | 
            +
                    description=_DESCRIPTION,
         | 
| 42 | 
            +
                    features=datasets.Features(
         | 
| 43 | 
            +
                        {
         | 
| 44 | 
            +
                            "path": datasets.Value("string"),
         | 
| 45 | 
            +
                            "audio": datasets.Audio(sampling_rate=16_000),
         | 
| 46 | 
            +
                            "text": datasets.Value("string"),
         | 
| 47 | 
            +
                        }
         | 
| 48 | 
            +
                    ),
         | 
| 49 | 
            +
                    supervised_keys=None,
         | 
| 50 | 
            +
                    homepage=_HOMEPAGE,
         | 
| 51 | 
            +
                    license=_LICENSE,
         | 
| 52 | 
            +
                    citation=_CITATION,
         | 
| 53 | 
            +
                )
         | 
| 54 | 
            +
             | 
| 55 | 
            +
                def _split_generators(self, dl_manager):
         | 
| 56 | 
            +
                    wav_archive = dl_manager.download(_DATA_URL)
         | 
| 57 | 
            +
                    txt_archive = dl_manager.download(_TEXT_URL)
         | 
| 58 | 
            +
                    test_dir = "dataset/test"
         | 
| 59 | 
            +
                    dev_dir = "dataset/dev"
         | 
| 60 | 
            +
                    train_dir = "dataset/train"
         | 
| 61 | 
            +
             | 
| 62 | 
            +
                    if dl_manager.is_streaming:
         | 
| 63 | 
            +
                
         | 
| 64 | 
            +
                        return [
         | 
| 65 | 
            +
                        datasets.SplitGenerator(
         | 
| 66 | 
            +
                            name=datasets.Split.TEST,
         | 
| 67 | 
            +
                            gen_kwargs={
         | 
| 68 | 
            +
                                "path_to_txt": test_dir + "/txt",
         | 
| 69 | 
            +
                                "path_to_wav": test_dir + "/wav",
         | 
| 70 | 
            +
                                "wav_files": [dl_manager.iter_archive(wav_archive['test'])],
         | 
| 71 | 
            +
                                "txt_files": dl_manager.iter_archive(txt_archive['test']),
         | 
| 72 | 
            +
                            },
         | 
| 73 | 
            +
                        ),
         | 
| 74 | 
            +
                        datasets.SplitGenerator(
         | 
| 75 | 
            +
                            name=datasets.Split.VALIDATION,
         | 
| 76 | 
            +
                            gen_kwargs={
         | 
| 77 | 
            +
                                "path_to_txt": dev_dir + "/txt",
         | 
| 78 | 
            +
                                "path_to_wav": dev_dir + "/wav",
         | 
| 79 | 
            +
                                "wav_files": [dl_manager.iter_archive(wav_archive['dev'])],
         | 
| 80 | 
            +
                                "txt_files": dl_manager.iter_archive(txt_archive['dev']),
         | 
| 81 | 
            +
                            },
         | 
| 82 | 
            +
                        ),
         | 
| 83 | 
            +
                        datasets.SplitGenerator(
         | 
| 84 | 
            +
                            name=datasets.Split.TRAIN,
         | 
| 85 | 
            +
                            gen_kwargs={
         | 
| 86 | 
            +
                                "path_to_txt": train_dir + "/txt",
         | 
| 87 | 
            +
                                "path_to_wav": train_dir + "/wav",
         | 
| 88 | 
            +
                                "wav_files": [dl_manager.iter_archive(a) for a in wav_archive['train']],
         | 
| 89 | 
            +
                                "txt_files": dl_manager.iter_archive(txt_archive['train']),
         | 
| 90 | 
            +
                            },
         | 
| 91 | 
            +
                        ),
         | 
| 92 | 
            +
                    ]
         | 
| 93 | 
            +
                    else:
         | 
| 94 | 
            +
                        return [
         | 
| 95 | 
            +
                        datasets.SplitGenerator(
         | 
| 96 | 
            +
                            name=datasets.Split.TEST,
         | 
| 97 | 
            +
                            gen_kwargs={
         | 
| 98 | 
            +
                                "path_to_txt": test_dir + "/txt",
         | 
| 99 | 
            +
                                "path_to_wav": test_dir + "/wav",
         | 
| 100 | 
            +
                                "wav_files": [dl_manager.extract(wav_archive['test'])],
         | 
| 101 | 
            +
                                "txt_files": dl_manager.extract(txt_archive['test']),
         | 
| 102 | 
            +
                            },
         | 
| 103 | 
            +
                        ),
         | 
| 104 | 
            +
                        datasets.SplitGenerator(
         | 
| 105 | 
            +
                            name=datasets.Split.VALIDATION,
         | 
| 106 | 
            +
                            gen_kwargs={
         | 
| 107 | 
            +
                                "path_to_txt": dev_dir + "/txt",
         | 
| 108 | 
            +
                                "path_to_wav": dev_dir + "/wav",
         | 
| 109 | 
            +
                                "wav_files": [dl_manager.extract(wav_archive['dev'])],
         | 
| 110 | 
            +
                                "txt_files": dl_manager.extract(txt_archive['dev']),
         | 
| 111 | 
            +
                            },
         | 
| 112 | 
            +
                        ),
         | 
| 113 | 
            +
                        datasets.SplitGenerator(
         | 
| 114 | 
            +
                            name=datasets.Split.TRAIN,
         | 
| 115 | 
            +
                            gen_kwargs={
         | 
| 116 | 
            +
                                "path_to_txt": train_dir + "/txt",
         | 
| 117 | 
            +
                                "path_to_wav": train_dir + "/wav",
         | 
| 118 | 
            +
                                "wav_files": [dl_manager.extract(a) for a in wav_archive['train']],
         | 
| 119 | 
            +
                                "txt_files": dl_manager.extract(txt_archive['train']),
         | 
| 120 | 
            +
                            },
         | 
| 121 | 
            +
                        ),
         | 
| 122 | 
            +
                    ]
         | 
| 123 | 
            +
             | 
| 124 | 
            +
             | 
| 125 | 
            +
                
         | 
| 126 | 
            +
                def _generate_examples(self, path_to_txt, path_to_wav, wav_files, txt_files):
         | 
| 127 | 
            +
                    """ 
         | 
| 128 | 
            +
                This assumes that the txt directory alphabetically precedes the wav directory.
         | 
| 129 | 
            +
                The file names of the wav and txt files match and are unique,
         | 
| 130 | 
            +
                so we can use them as dictionary keys to pair each transcript with its audio file.
         | 
| 131 | 
            +
                    """
         | 
| 132 | 
            +
                    examples = {}
         | 
| 133 | 
            +
                    id_ = 0
         | 
| 134 | 
            +
                    # need to prepare the transcript - wave map
         | 
| 135 | 
            +
                    for path, f in txt_files:
         | 
| 136 | 
            +
                        if path.find(path_to_txt) > -1:
         | 
| 137 | 
            +
             | 
| 138 | 
            +
                            wav_path = os.path.split(path)[1].replace("_utf8", "").replace(".txt", ".wav").strip()
         | 
| 139 | 
            +
             | 
| 140 | 
            +
                            txt = f.read().decode(encoding="utf-8").strip()
         | 
| 141 | 
            +
                            examples[wav_path] = {
         | 
| 142 | 
            +
                                "text": txt,
         | 
| 143 | 
            +
                                "path": wav_path,
         | 
| 144 | 
            +
                            }
         | 
| 145 | 
            +
             | 
| 146 | 
            +
                    for wf in wav_files:
         | 
| 147 | 
            +
                        for path, f in wf:
         | 
| 148 | 
            +
                            if path.find(path_to_wav) > -1:
         | 
| 149 | 
            +
                                wav_path = os.path.split(path)[1].strip()
         | 
| 150 | 
            +
                                audio = {"path": path, "bytes": f.read()}
         | 
| 151 | 
            +
                                yield id_, {**examples[wav_path], "audio": audio}
         | 
| 152 | 
            +
                                id_ += 1
         | 
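
The builder above pairs each transcript with its audio file via the shared file name. A minimal usage sketch (the script path and split are assumptions, and the tar archives referenced by `_DATA_URL`/`_TEXT_URL` must already exist under `archives/`):

    from datasets import load_dataset

    # Streaming iterates the tar archives lazily instead of extracting all
    # 48 training archives up front (the dl_manager.is_streaming branch).
    mgb2 = load_dataset("mgb2_speech.py", split="train", streaming=True)

    sample = next(iter(mgb2))
    print(sample["path"], sample["text"][:50])
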
    	
        normalizer.json
    ADDED
    
    | @@ -0,0 +1,1742 @@ | |
| 1 | 
            +
            {
         | 
| 2 | 
            +
              "accessorise": "accessorize",
         | 
| 3 | 
            +
              "accessorised": "accessorized",
         | 
| 4 | 
            +
              "accessorises": "accessorizes",
         | 
| 5 | 
            +
              "accessorising": "accessorizing",
         | 
| 6 | 
            +
              "acclimatisation": "acclimatization",
         | 
| 7 | 
            +
              "acclimatise": "acclimatize",
         | 
| 8 | 
            +
              "acclimatised": "acclimatized",
         | 
| 9 | 
            +
              "acclimatises": "acclimatizes",
         | 
| 10 | 
            +
              "acclimatising": "acclimatizing",
         | 
| 11 | 
            +
              "accoutrements": "accouterments",
         | 
| 12 | 
            +
              "aeon": "eon",
         | 
| 13 | 
            +
              "aeons": "eons",
         | 
| 14 | 
            +
              "aerogramme": "aerogram",
         | 
| 15 | 
            +
              "aerogrammes": "aerograms",
         | 
| 16 | 
            +
              "aeroplane": "airplane",
         | 
| 17 | 
            +
              "aeroplanes": "airplanes",
         | 
| 18 | 
            +
              "aesthete": "esthete",
         | 
| 19 | 
            +
              "aesthetes": "esthetes",
         | 
| 20 | 
            +
              "aesthetic": "esthetic",
         | 
| 21 | 
            +
              "aesthetically": "esthetically",
         | 
| 22 | 
            +
              "aesthetics": "esthetics",
         | 
| 23 | 
            +
              "aetiology": "etiology",
         | 
| 24 | 
            +
              "ageing": "aging",
         | 
| 25 | 
            +
              "aggrandisement": "aggrandizement",
         | 
| 26 | 
            +
              "agonise": "agonize",
         | 
| 27 | 
            +
              "agonised": "agonized",
         | 
| 28 | 
            +
              "agonises": "agonizes",
         | 
| 29 | 
            +
              "agonising": "agonizing",
         | 
| 30 | 
            +
              "agonisingly": "agonizingly",
         | 
| 31 | 
            +
              "almanack": "almanac",
         | 
| 32 | 
            +
              "almanacks": "almanacs",
         | 
| 33 | 
            +
              "aluminium": "aluminum",
         | 
| 34 | 
            +
              "amortisable": "amortizable",
         | 
| 35 | 
            +
              "amortisation": "amortization",
         | 
| 36 | 
            +
              "amortisations": "amortizations",
         | 
| 37 | 
            +
              "amortise": "amortize",
         | 
| 38 | 
            +
              "amortised": "amortized",
         | 
| 39 | 
            +
              "amortises": "amortizes",
         | 
| 40 | 
            +
              "amortising": "amortizing",
         | 
| 41 | 
            +
              "amphitheatre": "amphitheater",
         | 
| 42 | 
            +
              "amphitheatres": "amphitheaters",
         | 
| 43 | 
            +
              "anaemia": "anemia",
         | 
| 44 | 
            +
              "anaemic": "anemic",
         | 
| 45 | 
            +
              "anaesthesia": "anesthesia",
         | 
| 46 | 
            +
              "anaesthetic": "anesthetic",
         | 
| 47 | 
            +
              "anaesthetics": "anesthetics",
         | 
| 48 | 
            +
              "anaesthetise": "anesthetize",
         | 
| 49 | 
            +
              "anaesthetised": "anesthetized",
         | 
| 50 | 
            +
              "anaesthetises": "anesthetizes",
         | 
| 51 | 
            +
              "anaesthetising": "anesthetizing",
         | 
| 52 | 
            +
              "anaesthetist": "anesthetist",
         | 
| 53 | 
            +
              "anaesthetists": "anesthetists",
         | 
| 54 | 
            +
              "anaesthetize": "anesthetize",
         | 
| 55 | 
            +
              "anaesthetized": "anesthetized",
         | 
| 56 | 
            +
              "anaesthetizes": "anesthetizes",
         | 
| 57 | 
            +
              "anaesthetizing": "anesthetizing",
         | 
| 58 | 
            +
              "analogue": "analog",
         | 
| 59 | 
            +
              "analogues": "analogs",
         | 
| 60 | 
            +
              "analyse": "analyze",
         | 
| 61 | 
            +
              "analysed": "analyzed",
         | 
| 62 | 
            +
              "analyses": "analyzes",
         | 
| 63 | 
            +
              "analysing": "analyzing",
         | 
| 64 | 
            +
              "anglicise": "anglicize",
         | 
| 65 | 
            +
              "anglicised": "anglicized",
         | 
| 66 | 
            +
              "anglicises": "anglicizes",
         | 
| 67 | 
            +
              "anglicising": "anglicizing",
         | 
| 68 | 
            +
              "annualised": "annualized",
         | 
| 69 | 
            +
              "antagonise": "antagonize",
         | 
| 70 | 
            +
              "antagonised": "antagonized",
         | 
| 71 | 
            +
              "antagonises": "antagonizes",
         | 
| 72 | 
            +
              "antagonising": "antagonizing",
         | 
| 73 | 
            +
              "apologise": "apologize",
         | 
| 74 | 
            +
              "apologised": "apologized",
         | 
| 75 | 
            +
              "apologises": "apologizes",
         | 
| 76 | 
            +
              "apologising": "apologizing",
         | 
| 77 | 
            +
              "appal": "appall",
         | 
| 78 | 
            +
              "appals": "appalls",
         | 
| 79 | 
            +
              "appetiser": "appetizer",
         | 
| 80 | 
            +
              "appetisers": "appetizers",
         | 
| 81 | 
            +
              "appetising": "appetizing",
         | 
| 82 | 
            +
              "appetisingly": "appetizingly",
         | 
| 83 | 
            +
              "arbour": "arbor",
         | 
| 84 | 
            +
              "arbours": "arbors",
         | 
| 85 | 
            +
              "archaeologically": "archeologically",
         | 
| 86 | 
            +
              "archaeologist": "archeologist",
         | 
| 87 | 
            +
              "archaeologists": "archeologists",
         | 
| 88 | 
            +
              "archaeology": "archeology</span>",
         | 
| 89 | 
            +
              "archeological": "archaeological",
         | 
| 90 | 
            +
              "ardour": "ardor",
         | 
| 91 | 
            +
              "armour": "armor",
         | 
| 92 | 
            +
              "armoured": "armored",
         | 
| 93 | 
            +
              "armourer": "armorer",
         | 
| 94 | 
            +
              "armourers": "armorers",
         | 
| 95 | 
            +
              "armouries": "armories",
         | 
| 96 | 
            +
              "armoury": "armory",
         | 
| 97 | 
            +
              "artefact": "artifact",
         | 
| 98 | 
            +
              "artefacts": "artifacts",
         | 
| 99 | 
            +
              "authorise": "authorize",
         | 
| 100 | 
            +
              "authorised": "authorized",
         | 
| 101 | 
            +
              "authorises": "authorizes",
         | 
| 102 | 
            +
              "authorising": "authorizing",
         | 
| 103 | 
            +
              "axe": "ax",
         | 
| 104 | 
            +
              "backpedalled": "backpedaled",
         | 
| 105 | 
            +
              "backpedalling": "backpedaling",
         | 
| 106 | 
            +
              "bannister": "banister",
         | 
| 107 | 
            +
              "bannisters": "banisters",
         | 
| 108 | 
            +
              "baptise": "baptize",
         | 
| 109 | 
            +
              "baptised": "baptized",
         | 
| 110 | 
            +
              "baptises": "baptizes",
         | 
| 111 | 
            +
              "baptising": "baptizing",
         | 
| 112 | 
            +
              "bastardise": "bastardize",
         | 
| 113 | 
            +
              "bastardised": "bastardized",
         | 
| 114 | 
            +
              "bastardises": "bastardizes",
         | 
| 115 | 
            +
              "bastardising": "bastardizing",
         | 
| 116 | 
            +
              "battleax": "battleaxe",
         | 
| 117 | 
            +
              "baulk": "balk",
         | 
| 118 | 
            +
              "baulked": "balked",
         | 
| 119 | 
            +
              "baulking": "balking",
         | 
| 120 | 
            +
              "baulks": "balks",
         | 
| 121 | 
            +
              "bedevilled": "bedeviled",
         | 
| 122 | 
            +
              "bedevilling": "bedeviling",
         | 
| 123 | 
            +
              "behaviour": "behavior",
         | 
| 124 | 
            +
              "behavioural": "behavioral",
         | 
| 125 | 
            +
              "behaviourism": "behaviorism",
         | 
| 126 | 
            +
              "behaviourist": "behaviorist",
         | 
| 127 | 
            +
              "behaviourists": "behaviorists",
         | 
| 128 | 
            +
              "behaviours": "behaviors",
         | 
| 129 | 
            +
              "behove": "behoove",
         | 
| 130 | 
            +
              "behoved": "behooved",
         | 
| 131 | 
            +
              "behoves": "behooves",
         | 
| 132 | 
            +
              "bejewelled": "bejeweled",
         | 
| 133 | 
            +
              "belabour": "belabor",
         | 
| 134 | 
            +
              "belaboured": "belabored",
         | 
| 135 | 
            +
              "belabouring": "belaboring",
         | 
| 136 | 
            +
              "belabours": "belabors",
         | 
| 137 | 
            +
              "bevelled": "beveled",
         | 
| 138 | 
            +
              "bevvies": "bevies",
         | 
| 139 | 
            +
              "bevvy": "bevy",
         | 
| 140 | 
            +
              "biassed": "biased",
         | 
| 141 | 
            +
              "biassing": "biasing",
         | 
| 142 | 
            +
              "bingeing": "binging",
         | 
| 143 | 
            +
              "bougainvillaea": "bougainvillea",
         | 
| 144 | 
            +
              "bougainvillaeas": "bougainvilleas",
         | 
| 145 | 
            +
              "bowdlerise": "bowdlerize",
         | 
| 146 | 
            +
              "bowdlerised": "bowdlerized",
         | 
| 147 | 
            +
              "bowdlerises": "bowdlerizes",
         | 
| 148 | 
            +
              "bowdlerising": "bowdlerizing",
         | 
| 149 | 
            +
              "breathalyse": "breathalyze",
         | 
| 150 | 
            +
              "breathalysed": "breathalyzed",
         | 
| 151 | 
            +
              "breathalyser": "breathalyzer",
         | 
| 152 | 
            +
              "breathalysers": "breathalyzers",
         | 
| 153 | 
            +
              "breathalyses": "breathalyzes",
         | 
| 154 | 
            +
              "breathalysing": "breathalyzing",
         | 
| 155 | 
            +
              "brutalise": "brutalize",
         | 
| 156 | 
            +
              "brutalised": "brutalized",
         | 
| 157 | 
            +
              "brutalises": "brutalizes",
         | 
| 158 | 
            +
              "brutalising": "brutalizing",
         | 
| 159 | 
            +
              "busses": "buses",
         | 
| 160 | 
            +
              "bussing": "busing",
         | 
| 161 | 
            +
              "caesarean": "cesarean",
         | 
| 162 | 
            +
              "caesareans": "cesareans",
         | 
| 163 | 
            +
              "calibre": "caliber",
         | 
| 164 | 
            +
              "calibres": "calibers",
         | 
| 165 | 
            +
              "calliper": "caliper",
         | 
| 166 | 
            +
              "callipers": "calipers",
         | 
| 167 | 
            +
              "callisthenics": "calisthenics",
         | 
| 168 | 
            +
              "canalise": "canalize",
         | 
| 169 | 
            +
              "canalised": "canalized",
         | 
| 170 | 
            +
              "canalises": "canalizes",
         | 
| 171 | 
            +
              "canalising": "canalizing",
         | 
| 172 | 
            +
              "cancelation": "cancellation",
         | 
| 173 | 
            +
              "cancelations": "cancellations",
         | 
| 174 | 
            +
              "cancelled": "canceled",
         | 
| 175 | 
            +
              "cancelling": "canceling",
         | 
| 176 | 
            +
              "candour": "candor",
         | 
| 177 | 
            +
              "cannibalise": "cannibalize",
         | 
| 178 | 
            +
              "cannibalised": "cannibalized",
         | 
| 179 | 
            +
              "cannibalises": "cannibalizes",
         | 
| 180 | 
            +
              "cannibalising": "cannibalizing",
         | 
| 181 | 
            +
              "canonise": "canonize",
         | 
| 182 | 
            +
              "canonised": "canonized",
         | 
| 183 | 
            +
              "canonises": "canonizes",
         | 
| 184 | 
            +
              "canonising": "canonizing",
         | 
| 185 | 
            +
              "capitalise": "capitalize",
         | 
| 186 | 
            +
              "capitalised": "capitalized",
         | 
| 187 | 
            +
              "capitalises": "capitalizes",
         | 
| 188 | 
            +
              "capitalising": "capitalizing",
         | 
| 189 | 
            +
              "caramelise": "caramelize",
         | 
| 190 | 
            +
              "caramelised": "caramelized",
         | 
| 191 | 
            +
              "caramelises": "caramelizes",
         | 
| 192 | 
            +
              "caramelising": "caramelizing",
         | 
| 193 | 
            +
              "carbonise": "carbonize",
         | 
| 194 | 
            +
              "carbonised": "carbonized",
         | 
| 195 | 
            +
              "carbonises": "carbonizes",
         | 
| 196 | 
            +
              "carbonising": "carbonizing",
         | 
| 197 | 
            +
              "carolled": "caroled",
         | 
| 198 | 
            +
              "carolling": "caroling",
         | 
| 199 | 
            +
              "catalogue": "catalog",
         | 
| 200 | 
            +
              "catalogued": "cataloged",
         | 
| 201 | 
            +
              "catalogues": "catalogs",
         | 
| 202 | 
            +
              "cataloguing": "cataloging",
         | 
| 203 | 
            +
              "catalyse": "catalyze",
         | 
| 204 | 
            +
              "catalysed": "catalyzed",
         | 
| 205 | 
            +
              "catalyses": "catalyzes",
         | 
| 206 | 
            +
              "catalysing": "catalyzing",
         | 
| 207 | 
            +
              "categorise": "categorize",
         | 
| 208 | 
            +
              "categorised": "categorized",
         | 
| 209 | 
            +
              "categorises": "categorizes",
         | 
| 210 | 
            +
              "categorising": "categorizing",
         | 
| 211 | 
            +
              "cauterise": "cauterize",
         | 
| 212 | 
            +
              "cauterised": "cauterized",
         | 
| 213 | 
            +
              "cauterises": "cauterizes",
         | 
| 214 | 
            +
              "cauterising": "cauterizing",
         | 
| 215 | 
            +
              "cavilled": "caviled",
         | 
| 216 | 
            +
              "cavilling": "caviling",
         | 
| 217 | 
            +
              "centigramme": "centigram",
         | 
| 218 | 
            +
              "centigrammes": "centigrams",
         | 
| 219 | 
            +
              "centilitre": "centiliter",
         | 
| 220 | 
            +
              "centilitres": "centiliters",
         | 
| 221 | 
            +
              "centimetre": "centimeter",
         | 
| 222 | 
            +
              "centimetres": "centimeters",
         | 
| 223 | 
            +
              "centralise": "centralize",
         | 
| 224 | 
            +
              "centralised": "centralized",
         | 
| 225 | 
            +
              "centralises": "centralizes",
         | 
| 226 | 
            +
              "centralising": "centralizing",
         | 
| 227 | 
            +
              "centre": "center",
         | 
| 228 | 
            +
              "centred": "centered",
         | 
| 229 | 
            +
              "centrefold": "centerfold",
         | 
| 230 | 
            +
              "centrefolds": "centerfolds",
         | 
| 231 | 
            +
              "centrepiece": "centerpiece",
         | 
| 232 | 
            +
              "centrepieces": "centerpieces",
         | 
| 233 | 
            +
              "centres": "centers",
         | 
| 234 | 
            +
              "channelled": "channeled",
         | 
| 235 | 
            +
              "channelling": "channeling",
         | 
| 236 | 
            +
              "characterise": "characterize",
         | 
| 237 | 
            +
              "characterised": "characterized",
         | 
| 238 | 
            +
              "characterises": "characterizes",
         | 
| 239 | 
            +
              "characterising": "characterizing",
         | 
| 240 | 
            +
              "cheque": "check",
         | 
| 241 | 
            +
              "chequebook": "checkbook",
         | 
| 242 | 
            +
              "chequebooks": "checkbooks",
         | 
| 243 | 
            +
              "chequered": "checkered",
         | 
| 244 | 
            +
              "cheques": "checks",
         | 
| 245 | 
            +
              "chilli": "chili",
         | 
| 246 | 
            +
              "chimaera": "chimera",
         | 
| 247 | 
            +
              "chimaeras": "chimeras",
         | 
| 248 | 
            +
              "chiselled": "chiseled",
         | 
| 249 | 
            +
              "chiselling": "chiseling",
         | 
| 250 | 
            +
              "circularise": "circularize",
         | 
| 251 | 
            +
              "circularised": "circularized",
         | 
| 252 | 
            +
              "circularises": "circularizes",
         | 
| 253 | 
            +
              "circularising": "circularizing",
         | 
| 254 | 
            +
              "civilise": "civilize",
         | 
| 255 | 
            +
              "civilised": "civilized",
         | 
| 256 | 
            +
              "civilises": "civilizes",
         | 
| 257 | 
            +
              "civilising": "civilizing",
         | 
| 258 | 
            +
              "clamour": "clamor",
         | 
| 259 | 
            +
              "clamoured": "clamored",
         | 
| 260 | 
            +
              "clamouring": "clamoring",
         | 
| 261 | 
            +
              "clamours": "clamors",
         | 
| 262 | 
            +
              "clangour": "clangor",
         | 
| 263 | 
            +
              "clarinettist": "clarinetist",
         | 
| 264 | 
            +
              "clarinettists": "clarinetists",
         | 
| 265 | 
            +
              "collectivise": "collectivize",
         | 
| 266 | 
            +
              "collectivised": "collectivized",
         | 
| 267 | 
            +
              "collectivises": "collectivizes",
         | 
| 268 | 
            +
              "collectivising": "collectivizing",
         | 
| 269 | 
            +
              "colonisation": "colonization",
         | 
| 270 | 
            +
              "colonise": "colonize",
         | 
| 271 | 
            +
              "colonised": "colonized",
         | 
| 272 | 
            +
              "coloniser": "colonizer",
         | 
| 273 | 
            +
              "colonisers": "colonizers",
         | 
| 274 | 
            +
              "colonises": "colonizes",
         | 
| 275 | 
            +
              "colonising": "colonizing",
         | 
| 276 | 
            +
              "colour": "color",
         | 
| 277 | 
            +
              "colourant": "colorant",
         | 
| 278 | 
            +
              "colourants": "colorants",
         | 
| 279 | 
            +
              "coloured": "colored",
         | 
| 280 | 
            +
              "coloureds": "coloreds",
         | 
| 281 | 
            +
              "colourful": "colorful",
         | 
| 282 | 
            +
              "colourfully": "colorfully",
         | 
| 283 | 
            +
              "colouring": "coloring",
         | 
| 284 | 
            +
              "colourize": "colorize",
         | 
| 285 | 
            +
              "colourized": "colorized",
         | 
| 286 | 
            +
              "colourizes": "colorizes",
         | 
| 287 | 
            +
              "colourizing": "colorizing",
         | 
| 288 | 
            +
              "colourless": "colorless",
         | 
| 289 | 
            +
              "colours": "colors",
         | 
| 290 | 
            +
              "commercialise": "commercialize",
         | 
| 291 | 
            +
              "commercialised": "commercialized",
         | 
| 292 | 
            +
              "commercialises": "commercializes",
         | 
| 293 | 
            +
              "commercialising": "commercializing",
         | 
| 294 | 
            +
              "compartmentalise": "compartmentalize",
         | 
| 295 | 
            +
              "compartmentalised": "compartmentalized",
         | 
| 296 | 
            +
              "compartmentalises": "compartmentalizes",
         | 
| 297 | 
            +
              "compartmentalising": "compartmentalizing",
         | 
| 298 | 
            +
              "computerise": "computerize",
         | 
| 299 | 
            +
              "computerised": "computerized",
         | 
| 300 | 
            +
              "computerises": "computerizes",
         | 
| 301 | 
            +
              "computerising": "computerizing",
         | 
| 302 | 
            +
              "conceptualise": "conceptualize",
         | 
| 303 | 
            +
              "conceptualised": "conceptualized",
         | 
| 304 | 
            +
              "conceptualises": "conceptualizes",
         | 
| 305 | 
            +
              "conceptualising": "conceptualizing",
         | 
| 306 | 
            +
              "connexion": "connection",
         | 
| 307 | 
            +
              "connexions": "connections",
         | 
| 308 | 
            +
              "contextualise": "contextualize",
         | 
| 309 | 
            +
              "contextualised": "contextualized",
         | 
| 310 | 
            +
              "contextualises": "contextualizes",
         | 
| 311 | 
            +
              "contextualising": "contextualizing",
         | 
| 312 | 
            +
              "cosier": "cozier",
         | 
| 313 | 
            +
              "cosies": "cozies",
         | 
| 314 | 
            +
              "cosiest": "coziest",
         | 
| 315 | 
            +
              "cosily": "cozily",
         | 
| 316 | 
            +
              "cosiness": "coziness",
         | 
| 317 | 
            +
              "cosy": "cozy",
         | 
| 318 | 
            +
              "councillor": "councilor",
         | 
| 319 | 
            +
              "councillors": "councilors",
         | 
| 320 | 
            +
              "counselled": "counseled",
         | 
| 321 | 
            +
              "counselling": "counseling",
         | 
| 322 | 
            +
              "counsellor": "counselor",
         | 
| 323 | 
            +
              "counsellors": "counselors",
         | 
| 324 | 
            +
              "crenelated": "crenellated",
         | 
| 325 | 
            +
              "criminalise": "criminalize",
         | 
| 326 | 
            +
              "criminalised": "criminalized",
         | 
| 327 | 
            +
              "criminalises": "criminalizes",
         | 
| 328 | 
            +
              "criminalising": "criminalizing",
         | 
| 329 | 
            +
              "criticise": "criticize",
         | 
| 330 | 
            +
              "criticised": "criticized",
         | 
| 331 | 
            +
              "criticises": "criticizes",
         | 
| 332 | 
            +
              "criticising": "criticizing",
         | 
| 333 | 
            +
              "crueller": "crueler",
         | 
| 334 | 
            +
              "cruellest": "cruelest",
         | 
| 335 | 
            +
              "crystallisation": "crystallization",
         | 
| 336 | 
            +
              "crystallise": "crystallize",
         | 
| 337 | 
            +
              "crystallised": "crystallized",
         | 
| 338 | 
            +
              "crystallises": "crystallizes",
         | 
| 339 | 
            +
              "crystallising": "crystallizing",
         | 
| 340 | 
            +
              "cudgelled": "cudgeled",
         | 
| 341 | 
            +
              "cudgelling": "cudgeling",
         | 
| 342 | 
            +
              "customise": "customize",
         | 
| 343 | 
            +
              "customised": "customized",
         | 
| 344 | 
            +
              "customises": "customizes",
         | 
| 345 | 
            +
              "customising": "customizing",
         | 
| 346 | 
            +
              "cypher": "cipher",
         | 
| 347 | 
            +
              "cyphers": "ciphers",
         | 
| 348 | 
            +
              "decentralisation": "decentralization",
         | 
| 349 | 
            +
              "decentralise": "decentralize",
         | 
| 350 | 
            +
              "decentralised": "decentralized",
         | 
| 351 | 
            +
              "decentralises": "decentralizes",
         | 
| 352 | 
            +
              "decentralising": "decentralizing",
         | 
| 353 | 
            +
              "decriminalisation": "decriminalization",
         | 
| 354 | 
            +
              "decriminalise": "decriminalize",
         | 
| 355 | 
            +
              "decriminalised": "decriminalized",
         | 
| 356 | 
            +
              "decriminalises": "decriminalizes",
         | 
| 357 | 
            +
              "decriminalising": "decriminalizing",
         | 
| 358 | 
            +
              "defence": "defense",
         | 
| 359 | 
            +
              "defenceless": "defenseless",
         | 
| 360 | 
            +
              "defences": "defenses",
         | 
| 361 | 
            +
              "dehumanisation": "dehumanization",
         | 
| 362 | 
            +
              "dehumanise": "dehumanize",
         | 
| 363 | 
            +
              "dehumanised": "dehumanized",
         | 
| 364 | 
            +
              "dehumanises": "dehumanizes",
         | 
| 365 | 
            +
              "dehumanising": "dehumanizing",
         | 
| 366 | 
            +
              "demeanour": "demeanor",
         | 
| 367 | 
            +
              "demilitarisation": "demilitarization",
         | 
| 368 | 
            +
              "demilitarise": "demilitarize",
         | 
| 369 | 
            +
              "demilitarised": "demilitarized",
         | 
| 370 | 
            +
              "demilitarises": "demilitarizes",
         | 
| 371 | 
            +
              "demilitarising": "demilitarizing",
         | 
| 372 | 
            +
              "demobilisation": "demobilization",
         | 
| 373 | 
            +
              "demobilise": "demobilize",
         | 
| 374 | 
            +
              "demobilised": "demobilized",
         | 
| 375 | 
            +
              "demobilises": "demobilizes",
         | 
| 376 | 
            +
              "demobilising": "demobilizing",
         | 
| 377 | 
            +
              "democratisation": "democratization",
         | 
| 378 | 
            +
              "democratise": "democratize",
         | 
| 379 | 
            +
              "democratised": "democratized",
         | 
| 380 | 
            +
              "democratises": "democratizes",
         | 
| 381 | 
            +
              "democratising": "democratizing",
         | 
| 382 | 
            +
              "demonise": "demonize",
         | 
| 383 | 
            +
              "demonised": "demonized",
         | 
| 384 | 
            +
              "demonises": "demonizes",
         | 
| 385 | 
            +
              "demonising": "demonizing",
         | 
| 386 | 
            +
              "demoralisation": "demoralization",
         | 
| 387 | 
            +
              "demoralise": "demoralize",
         | 
| 388 | 
            +
              "demoralised": "demoralized",
         | 
| 389 | 
            +
              "demoralises": "demoralizes",
         | 
| 390 | 
            +
              "demoralising": "demoralizing",
         | 
| 391 | 
            +
              "denationalisation": "denationalization",
         | 
| 392 | 
            +
              "denationalise": "denationalize",
         | 
| 393 | 
            +
              "denationalised": "denationalized",
         | 
| 394 | 
            +
              "denationalises": "denationalizes",
         | 
| 395 | 
            +
              "denationalising": "denationalizing",
         | 
| 396 | 
            +
              "deodorise": "deodorize",
         | 
| 397 | 
            +
              "deodorised": "deodorized",
         | 
| 398 | 
            +
              "deodorises": "deodorizes",
         | 
| 399 | 
            +
              "deodorising": "deodorizing",
         | 
| 400 | 
            +
              "depersonalise": "depersonalize",
         | 
| 401 | 
            +
              "depersonalised": "depersonalized",
         | 
| 402 | 
            +
              "depersonalises": "depersonalizes",
         | 
| 403 | 
            +
              "depersonalising": "depersonalizing",
         | 
| 404 | 
            +
              "deputise": "deputize",
         | 
| 405 | 
            +
              "deputised": "deputized",
         | 
| 406 | 
            +
              "deputises": "deputizes",
         | 
| 407 | 
            +
              "deputising": "deputizing",
         | 
| 408 | 
            +
              "desensitisation": "desensitization",
         | 
| 409 | 
            +
              "desensitise": "desensitize",
         | 
| 410 | 
            +
              "desensitised": "desensitized",
         | 
| 411 | 
            +
              "desensitises": "desensitizes",
         | 
| 412 | 
            +
              "desensitising": "desensitizing",
         | 
| 413 | 
            +
              "destabilisation": "destabilization",
         | 
| 414 | 
            +
              "destabilise": "destabilize",
         | 
| 415 | 
            +
              "destabilised": "destabilized",
         | 
| 416 | 
            +
              "destabilises": "destabilizes",
         | 
| 417 | 
            +
              "destabilising": "destabilizing",
         | 
| 418 | 
            +
              "dialled": "dialed",
         | 
| 419 | 
            +
              "dialling": "dialing",
         | 
| 420 | 
            +
              "dialogue": "dialog",
         | 
| 421 | 
            +
              "dialogues": "dialogs",
         | 
| 422 | 
            +
              "diarrhoea": "diarrhea",
         | 
| 423 | 
            +
              "digitise": "digitize",
         | 
| 424 | 
            +
              "digitised": "digitized",
         | 
| 425 | 
            +
              "digitises": "digitizes",
         | 
| 426 | 
            +
              "digitising": "digitizing",
         | 
| 427 | 
            +
              "disc": "disk",
         | 
| 428 | 
            +
              "discolour": "discolor",
         | 
| 429 | 
            +
              "discoloured": "discolored",
         | 
| 430 | 
            +
              "discolouring": "discoloring",
         | 
| 431 | 
            +
              "discolours": "discolors",
         | 
| 432 | 
            +
              "discs": "disks",
         | 
| 433 | 
            +
              "disembowelled": "disemboweled",
         | 
| 434 | 
            +
              "disembowelling": "disemboweling",
         | 
| 435 | 
            +
              "disfavour": "disfavor",
         | 
| 436 | 
            +
              "dishevelled": "disheveled",
         | 
| 437 | 
            +
              "dishonour": "dishonor",
         | 
| 438 | 
            +
              "dishonourable": "dishonorable",
         | 
| 439 | 
            +
              "dishonourably": "dishonorably",
         | 
| 440 | 
            +
              "dishonoured": "dishonored",
         | 
| 441 | 
            +
              "dishonouring": "dishonoring",
         | 
| 442 | 
            +
              "dishonours": "dishonors",
         | 
| 443 | 
            +
              "disorganisation": "disorganization",
         | 
| 444 | 
            +
              "disorganised": "disorganized",
         | 
| 445 | 
            +
              "distil": "distill",
         | 
| 446 | 
            +
              "distils": "distills",
         | 
| 447 | 
            +
              "dramatisation": "dramatization",
         | 
| 448 | 
            +
              "dramatisations": "dramatizations",
         | 
| 449 | 
            +
              "dramatise": "dramatize",
         | 
| 450 | 
            +
              "dramatised": "dramatized",
         | 
| 451 | 
            +
              "dramatises": "dramatizes",
         | 
| 452 | 
            +
              "dramatising": "dramatizing",
         | 
| 453 | 
            +
              "draught": "draft",
         | 
| 454 | 
            +
              "draughtboard": "draftboard",
         | 
| 455 | 
            +
              "draughtboards": "draftboards",
         | 
| 456 | 
            +
              "draughtier": "draftier",
         | 
| 457 | 
            +
              "draughtiest": "draftiest",
         | 
| 458 | 
            +
              "draughts": "drafts",
         | 
| 459 | 
            +
              "draughtsman": "draftsman",
         | 
| 460 | 
            +
              "draughtsmanship": "draftsmanship",
         | 
| 461 | 
            +
              "draughtsmen": "draftsmen",
         | 
| 462 | 
            +
              "draughtswoman": "draftswoman",
         | 
| 463 | 
            +
              "draughtswomen": "draftswomen",
         | 
| 464 | 
            +
              "draughty": "drafty",
         | 
| 465 | 
            +
              "drivelled": "driveled",
         | 
| 466 | 
            +
              "drivelling": "driveling",
         | 
| 467 | 
            +
              "duelled": "dueled",
         | 
| 468 | 
            +
              "duelling": "dueling",
         | 
| 469 | 
            +
              "economise": "economize",
         | 
| 470 | 
            +
              "economised": "economized",
         | 
| 471 | 
            +
              "economises": "economizes",
         | 
| 472 | 
            +
              "economising": "economizing",
         | 
| 473 | 
            +
              "editorialise": "editorialize",
         | 
| 474 | 
            +
              "editorialised": "editorialized",
         | 
| 475 | 
            +
              "editorialises": "editorializes",
         | 
| 476 | 
            +
              "editorialising": "editorializing",
         | 
| 477 | 
            +
              "edoema": "edema",
         | 
| 478 | 
            +
              "empathise": "empathize",
         | 
| 479 | 
            +
              "empathised": "empathized",
         | 
| 480 | 
            +
              "empathises": "empathizes",
         | 
| 481 | 
            +
              "empathising": "empathizing",
         | 
| 482 | 
            +
              "emphasise": "emphasize",
         | 
| 483 | 
            +
              "emphasised": "emphasized",
         | 
| 484 | 
            +
              "emphasises": "emphasizes",
         | 
| 485 | 
            +
              "emphasising": "emphasizing",
         | 
| 486 | 
            +
              "enamelled": "enameled",
         | 
| 487 | 
            +
              "enamelling": "enameling",
         | 
| 488 | 
            +
              "enamoured": "enamored",
         | 
| 489 | 
            +
              "encyclopaedia": "encyclopedia",
         | 
| 490 | 
            +
              "encyclopaedias": "encyclopedias",
         | 
| 491 | 
            +
              "encyclopaedic": "encyclopedic",
         | 
| 492 | 
            +
              "endeavour": "endeavor",
         | 
| 493 | 
            +
              "endeavoured": "endeavored",
         | 
| 494 | 
            +
              "endeavouring": "endeavoring",
         | 
| 495 | 
            +
              "endeavours": "endeavors",
         | 
| 496 | 
            +
              "energise": "energize",
         | 
| 497 | 
            +
              "energised": "energized",
         | 
| 498 | 
            +
              "energises": "energizes",
         | 
| 499 | 
            +
              "energising": "energizing",
         | 
| 500 | 
            +
              "enrol": "enroll",
         | 
| 501 | 
            +
              "enrols": "enrolls",
         | 
| 502 | 
            +
              "enthral": "enthrall",
         | 
| 503 | 
            +
              "enthrals": "enthralls",
         | 
| 504 | 
            +
              "epaulette": "epaulet",
         | 
| 505 | 
            +
              "epaulettes": "epaulets",
         | 
| 506 | 
            +
              "epicentre": "epicenter",
         | 
| 507 | 
            +
              "epicentres": "epicenters",
         | 
| 508 | 
            +
              "epilogue": "epilog",
         | 
| 509 | 
            +
              "epilogues": "epilogs",
         | 
| 510 | 
            +
              "epitomise": "epitomize",
         | 
| 511 | 
            +
              "epitomised": "epitomized",
         | 
| 512 | 
            +
              "epitomises": "epitomizes",
         | 
| 513 | 
            +
              "epitomising": "epitomizing",
         | 
| 514 | 
            +
              "equalisation": "equalization",
         | 
| 515 | 
            +
              "equalise": "equalize",
         | 
| 516 | 
            +
              "equalised": "equalized",
         | 
| 517 | 
            +
              "equaliser": "equalizer",
         | 
| 518 | 
            +
              "equalisers": "equalizers",
         | 
| 519 | 
            +
              "equalises": "equalizes",
         | 
| 520 | 
            +
              "equalising": "equalizing",
         | 
| 521 | 
            +
              "eulogise": "eulogize",
         | 
| 522 | 
            +
              "eulogised": "eulogized",
         | 
| 523 | 
            +
              "eulogises": "eulogizes",
         | 
| 524 | 
            +
              "eulogising": "eulogizing",
         | 
| 525 | 
            +
              "evangelise": "evangelize",
         | 
| 526 | 
            +
              "evangelised": "evangelized",
         | 
| 527 | 
            +
              "evangelises": "evangelizes",
         | 
| 528 | 
            +
              "evangelising": "evangelizing",
         | 
| 529 | 
            +
              "exorcise": "exorcize",
         | 
| 530 | 
            +
              "exorcised": "exorcized",
         | 
| 531 | 
            +
              "exorcises": "exorcizes",
         | 
| 532 | 
            +
              "exorcising": "exorcizing",
         | 
| 533 | 
            +
              "extemporisation": "extemporization",
         | 
| 534 | 
            +
              "extemporise": "extemporize",
         | 
| 535 | 
            +
              "extemporised": "extemporized",
         | 
| 536 | 
            +
              "extemporises": "extemporizes",
         | 
| 537 | 
            +
              "extemporising": "extemporizing",
         | 
| 538 | 
            +
              "externalisation": "externalization",
         | 
| 539 | 
            +
              "externalisations": "externalizations",
         | 
| 540 | 
            +
              "externalise": "externalize",
         | 
| 541 | 
            +
              "externalised": "externalized",
         | 
| 542 | 
            +
              "externalises": "externalizes",
         | 
| 543 | 
            +
              "externalising": "externalizing",
         | 
| 544 | 
            +
              "factorise": "factorize",
         | 
| 545 | 
            +
              "factorised": "factorized",
         | 
| 546 | 
            +
              "factorises": "factorizes",
         | 
| 547 | 
            +
              "factorising": "factorizing",
         | 
| 548 | 
            +
              "faecal": "fecal",
         | 
| 549 | 
            +
              "faeces": "feces",
         | 
| 550 | 
            +
              "familiarisation": "familiarization",
         | 
| 551 | 
            +
              "familiarise": "familiarize",
         | 
| 552 | 
            +
              "familiarised": "familiarized",
         | 
| 553 | 
            +
              "familiarises": "familiarizes",
         | 
| 554 | 
            +
              "familiarising": "familiarizing",
         | 
| 555 | 
            +
              "fantasise": "fantasize",
         | 
| 556 | 
            +
              "fantasised": "fantasized",
         | 
| 557 | 
            +
              "fantasises": "fantasizes",
         | 
| 558 | 
            +
              "fantasising": "fantasizing",
         | 
| 559 | 
            +
              "favour": "favor",
         | 
| 560 | 
            +
              "favourable": "favorable",
         | 
| 561 | 
            +
              "favourably": "favorably",
         | 
| 562 | 
            +
              "favoured": "favored",
         | 
| 563 | 
            +
              "favouring": "favoring",
         | 
| 564 | 
            +
              "favourite": "favorite",
         | 
| 565 | 
            +
              "favourites": "favorites",
         | 
| 566 | 
            +
              "favouritism": "favoritism",
         | 
| 567 | 
            +
              "favours": "favors",
         | 
| 568 | 
            +
              "feminise": "feminize",
         | 
| 569 | 
            +
              "feminised": "feminized",
         | 
| 570 | 
            +
              "feminises": "feminizes",
         | 
| 571 | 
            +
              "feminising": "feminizing",
         | 
| 572 | 
            +
              "fertilisation": "fertilization",
         | 
| 573 | 
            +
              "fertilise": "fertilize",
         | 
| 574 | 
            +
              "fertilised": "fertilized",
         | 
| 575 | 
            +
              "fertiliser": "fertilizer",
         | 
| 576 | 
            +
              "fertilisers": "fertilizers",
         | 
| 577 | 
            +
              "fertilises": "fertilizes",
         | 
| 578 | 
            +
              "fertilising": "fertilizing",
         | 
| 579 | 
            +
              "fervour": "fervor",
         | 
| 580 | 
            +
              "fibre": "fiber",
         | 
| 581 | 
            +
              "fibreglass": "fiberglass",
         | 
| 582 | 
            +
              "fibres": "fibers",
         | 
| 583 | 
            +
              "fictionalisation": "fictionalization",
         | 
| 584 | 
            +
              "fictionalisations": "fictionalizations",
         | 
| 585 | 
            +
              "fictionalise": "fictionalize",
         | 
| 586 | 
            +
              "fictionalised": "fictionalized",
         | 
| 587 | 
            +
              "fictionalises": "fictionalizes",
         | 
| 588 | 
            +
              "fictionalising": "fictionalizing",
         | 
| 589 | 
            +
              "fillet": "filet",
         | 
| 590 | 
            +
              "filleted": "fileted",
         | 
| 591 | 
            +
              "filleting": "fileting",
         | 
| 592 | 
            +
              "fillets": "filets",
         | 
| 593 | 
            +
              "finalisation": "finalization",
         | 
| 594 | 
            +
              "finalise": "finalize",
         | 
| 595 | 
            +
              "finalised": "finalized",
         | 
| 596 | 
            +
              "finalises": "finalizes",
         | 
| 597 | 
            +
              "finalising": "finalizing",
         | 
| 598 | 
            +
              "flautist": "flutist",
         | 
| 599 | 
            +
              "flautists": "flutists",
         | 
| 600 | 
            +
              "flavour": "flavor",
         | 
| 601 | 
            +
              "flavoured": "flavored",
         | 
| 602 | 
            +
              "flavouring": "flavoring",
         | 
| 603 | 
            +
              "flavourings": "flavorings",
         | 
| 604 | 
            +
              "flavourless": "flavorless",
         | 
| 605 | 
            +
              "flavours": "flavors",
         | 
| 606 | 
            +
              "flavoursome": "flavorsome",
         | 
| 607 | 
            +
              "flyer / flier": "flier / flyer",
         | 
| 608 | 
            +
              "foetal": "fetal",
         | 
| 609 | 
            +
              "foetid": "fetid",
         | 
| 610 | 
            +
              "foetus": "fetus",
         | 
| 611 | 
            +
              "foetuses": "fetuses",
         | 
| 612 | 
            +
              "formalisation": "formalization",
         | 
| 613 | 
            +
              "formalise": "formalize",
         | 
| 614 | 
            +
              "formalised": "formalized",
         | 
| 615 | 
            +
              "formalises": "formalizes",
         | 
| 616 | 
            +
              "formalising": "formalizing",
         | 
| 617 | 
            +
              "fossilisation": "fossilization",
         | 
| 618 | 
            +
              "fossilise": "fossilize",
         | 
| 619 | 
            +
              "fossilised": "fossilized",
         | 
| 620 | 
            +
              "fossilises": "fossilizes",
         | 
| 621 | 
            +
              "fossilising": "fossilizing",
         | 
| 622 | 
            +
              "fraternisation": "fraternization",
         | 
| 623 | 
            +
              "fraternise": "fraternize",
         | 
| 624 | 
            +
              "fraternised": "fraternized",
         | 
| 625 | 
            +
              "fraternises": "fraternizes",
         | 
| 626 | 
            +
              "fraternising": "fraternizing",
         | 
| 627 | 
            +
              "fulfil": "fulfill",
         | 
| 628 | 
            +
              "fulfilment": "fulfillment",
         | 
| 629 | 
            +
              "fulfils": "fulfills",
         | 
| 630 | 
            +
              "funnelled": "funneled",
         | 
| 631 | 
            +
              "funnelling": "funneling",
         | 
| 632 | 
            +
              "gage": "gauge",
         | 
| 633 | 
            +
              "gaged": "gauged",
         | 
| 634 | 
            +
              "gages": "gauges",
         | 
| 635 | 
            +
              "gaging": "gauging",
         | 
| 636 | 
            +
              "galvanise": "galvanize",
         | 
| 637 | 
            +
              "galvanised": "galvanized",
         | 
| 638 | 
            +
              "galvanises": "galvanizes",
         | 
| 639 | 
            +
              "galvanising": "galvanizing",
         | 
| 640 | 
            +
              "gambolled": "gamboled",
         | 
| 641 | 
            +
              "gambolling": "gamboling",
         | 
| 642 | 
            +
              "gaol": "jail",
         | 
| 643 | 
            +
              "gaolbird": "jailbird",
         | 
| 644 | 
            +
              "gaolbirds": "jailbirds",
         | 
| 645 | 
            +
              "gaolbreak": "jailbreak",
         | 
| 646 | 
            +
              "gaolbreaks": "jailbreaks",
         | 
| 647 | 
            +
              "gaoled": "jailed",
         | 
| 648 | 
            +
              "gaoler": "jailer",
         | 
| 649 | 
            +
              "gaolers": "jailers",
         | 
| 650 | 
            +
              "gaoling": "jailing",
         | 
| 651 | 
            +
              "gaols": "jails",
         | 
| 652 | 
            +
              "gasses": "gases",
         | 
| 653 | 
            +
              "generalisation": "generalization",
         | 
| 654 | 
            +
              "generalisations": "generalizations",
         | 
| 655 | 
            +
              "generalise": "generalize",
         | 
| 656 | 
            +
              "generalised": "generalized",
         | 
| 657 | 
            +
              "generalises": "generalizes",
         | 
| 658 | 
            +
              "generalising": "generalizing",
         | 
| 659 | 
            +
              "ghettoise": "ghettoize",
         | 
| 660 | 
            +
              "ghettoised": "ghettoized",
         | 
| 661 | 
            +
              "ghettoises": "ghettoizes",
         | 
| 662 | 
            +
              "ghettoising": "ghettoizing",
         | 
| 663 | 
            +
              "gipsies": "gypsies",
         | 
| 664 | 
            +
              "glamor": "glamour",
         | 
| 665 | 
            +
              "glamorise": "glamorize",
         | 
| 666 | 
            +
              "glamorised": "glamorized",
         | 
| 667 | 
            +
              "glamorises": "glamorizes",
         | 
| 668 | 
            +
              "glamorising": "glamorizing",
         | 
| 669 | 
            +
              "globalisation": "globalization",
         | 
| 670 | 
            +
              "globalise": "globalize",
         | 
| 671 | 
            +
              "globalised": "globalized",
         | 
| 672 | 
            +
              "globalises": "globalizes",
         | 
| 673 | 
            +
              "globalising": "globalizing",
         | 
| 674 | 
            +
              "glueing": "gluing",
         | 
| 675 | 
            +
              "goitre": "goiter",
         | 
| 676 | 
            +
              "goitres": "goiters",
         | 
| 677 | 
            +
              "gonorrhoea": "gonorrhea",
         | 
| 678 | 
            +
              "gramme": "gram",
         | 
| 679 | 
            +
              "grammes": "grams",
         | 
| 680 | 
            +
              "gravelled": "graveled",
         | 
| 681 | 
            +
              "grey": "gray",
         | 
| 682 | 
            +
              "greyed": "grayed",
         | 
| 683 | 
            +
              "greying": "graying",
         | 
| 684 | 
            +
              "greyish": "grayish",
         | 
| 685 | 
            +
              "greyness": "grayness",
         | 
| 686 | 
            +
              "greys": "grays",
         | 
| 687 | 
            +
              "grovelled": "groveled",
         | 
| 688 | 
            +
              "grovelling": "groveling",
         | 
| 689 | 
            +
              "groyne": "groin",
         | 
| 690 | 
            +
              "groynes": "groins",
         | 
| 691 | 
            +
              "gruelling": "grueling",
         | 
| 692 | 
            +
              "gruellingly": "gruelingly",
         | 
| 693 | 
            +
              "gryphon": "griffin",
         | 
| 694 | 
            +
              "gryphons": "griffins",
         | 
| 695 | 
            +
              "gynaecological": "gynecological",
         | 
| 696 | 
            +
              "gynaecologist": "gynecologist",
         | 
| 697 | 
            +
              "gynaecologists": "gynecologists",
         | 
| 698 | 
            +
              "gynaecology": "gynecology",
         | 
| 699 | 
            +
              "haematological": "hematological",
         | 
| 700 | 
            +
              "haematologist": "hematologist",
         | 
| 701 | 
            +
              "haematologists": "hematologists",
         | 
| 702 | 
            +
              "haematology": "hematology",
         | 
| 703 | 
            +
              "haemoglobin": "hemoglobin",
         | 
| 704 | 
            +
              "haemophilia": "hemophilia",
         | 
| 705 | 
            +
              "haemophiliac": "hemophiliac",
         | 
| 706 | 
            +
              "haemophiliacs": "hemophiliacs",
         | 
| 707 | 
            +
              "haemorrhage": "hemorrhage",
         | 
| 708 | 
            +
              "haemorrhaged": "hemorrhaged",
         | 
| 709 | 
            +
              "haemorrhages": "hemorrhages",
         | 
| 710 | 
            +
              "haemorrhaging": "hemorrhaging",
         | 
| 711 | 
            +
              "haemorrhoids": "hemorrhoids",
         | 
| 712 | 
            +
              "harbour": "harbor",
         | 
| 713 | 
            +
              "harboured": "harbored",
         | 
| 714 | 
            +
              "harbouring": "harboring",
         | 
| 715 | 
            +
              "harbours": "harbors",
         | 
| 716 | 
            +
              "harmonisation": "harmonization",
         | 
| 717 | 
            +
              "harmonise": "harmonize",
         | 
| 718 | 
            +
              "harmonised": "harmonized",
         | 
| 719 | 
            +
              "harmonises": "harmonizes",
         | 
| 720 | 
            +
              "harmonising": "harmonizing",
         | 
| 721 | 
            +
              "homoeopath": "homeopath",
         | 
| 722 | 
            +
              "homoeopathic": "homeopathic",
         | 
| 723 | 
            +
              "homoeopaths": "homeopaths",
         | 
| 724 | 
            +
              "homoeopathy": "homeopathy",
         | 
| 725 | 
            +
              "homogenise": "homogenize",
         | 
| 726 | 
            +
              "homogenised": "homogenized",
         | 
| 727 | 
            +
              "homogenises": "homogenizes",
         | 
| 728 | 
            +
              "homogenising": "homogenizing",
         | 
| 729 | 
            +
              "honour": "honor",
         | 
| 730 | 
            +
              "honourable": "honorable",
         | 
| 731 | 
            +
              "honourably": "honorably",
         | 
| 732 | 
            +
              "honoured": "honored",
         | 
| 733 | 
            +
              "honouring": "honoring",
         | 
| 734 | 
            +
              "honours": "honors",
         | 
| 735 | 
            +
              "hospitalisation": "hospitalization",
         | 
| 736 | 
            +
              "hospitalise": "hospitalize",
         | 
| 737 | 
            +
              "hospitalised": "hospitalized",
         | 
| 738 | 
            +
              "hospitalises": "hospitalizes",
         | 
| 739 | 
            +
              "hospitalising": "hospitalizing",
         | 
| 740 | 
            +
              "humanise": "humanize",
         | 
| 741 | 
            +
              "humanised": "humanized",
         | 
| 742 | 
            +
              "humanises": "humanizes",
         | 
| 743 | 
            +
              "humanising": "humanizing",
         | 
| 744 | 
            +
              "humour": "humor",
         | 
| 745 | 
            +
              "humoured": "humored",
         | 
| 746 | 
            +
              "humouring": "humoring",
         | 
| 747 | 
            +
              "humourless": "humorless",
         | 
| 748 | 
            +
              "humours": "humors",
         | 
| 749 | 
            +
              "hybridise": "hybridize",
         | 
| 750 | 
            +
              "hybridised": "hybridized",
         | 
| 751 | 
            +
              "hybridises": "hybridizes",
         | 
| 752 | 
            +
              "hybridising": "hybridizing",
         | 
| 753 | 
            +
              "hypnotise": "hypnotize",
         | 
| 754 | 
            +
              "hypnotised": "hypnotized",
         | 
| 755 | 
            +
              "hypnotises": "hypnotizes",
         | 
| 756 | 
            +
              "hypnotising": "hypnotizing",
         | 
| 757 | 
            +
              "hypothesise": "hypothesize",
         | 
| 758 | 
            +
              "hypothesised": "hypothesized",
         | 
| 759 | 
            +
              "hypothesises": "hypothesizes",
         | 
| 760 | 
            +
              "hypothesising": "hypothesizing",
         | 
| 761 | 
            +
              "idealisation": "idealization",
         | 
| 762 | 
            +
              "idealise": "idealize",
         | 
| 763 | 
            +
              "idealised": "idealized",
         | 
| 764 | 
            +
              "idealises": "idealizes",
         | 
| 765 | 
            +
              "idealising": "idealizing",
         | 
| 766 | 
            +
              "idolise": "idolize",
         | 
| 767 | 
            +
              "idolised": "idolized",
         | 
| 768 | 
            +
              "idolises": "idolizes",
         | 
| 769 | 
            +
              "idolising": "idolizing",
         | 
| 770 | 
            +
              "immobilisation": "immobilization",
         | 
| 771 | 
            +
              "immobilise": "immobilize",
         | 
| 772 | 
            +
              "immobilised": "immobilized",
         | 
| 773 | 
            +
              "immobiliser": "immobilizer",
         | 
| 774 | 
            +
              "immobilisers": "immobilizers",
         | 
| 775 | 
            +
              "immobilises": "immobilizes",
         | 
| 776 | 
            +
              "immobilising": "immobilizing",
         | 
| 777 | 
            +
              "immortalise": "immortalize",
         | 
| 778 | 
            +
              "immortalised": "immortalized",
         | 
| 779 | 
            +
              "immortalises": "immortalizes",
         | 
| 780 | 
            +
              "immortalising": "immortalizing",
         | 
| 781 | 
            +
              "immunisation": "immunization",
         | 
| 782 | 
            +
              "immunise": "immunize",
         | 
| 783 | 
            +
              "immunised": "immunized",
         | 
| 784 | 
            +
              "immunises": "immunizes",
         | 
| 785 | 
            +
              "immunising": "immunizing",
         | 
| 786 | 
            +
              "impanelled": "impaneled",
         | 
| 787 | 
            +
              "impanelling": "impaneling",
         | 
| 788 | 
            +
              "imperilled": "imperiled",
         | 
| 789 | 
            +
              "imperilling": "imperiling",
         | 
| 790 | 
            +
              "individualise": "individualize",
         | 
| 791 | 
            +
              "individualised": "individualized",
         | 
| 792 | 
            +
              "individualises": "individualizes",
         | 
| 793 | 
            +
              "individualising": "individualizing",
         | 
| 794 | 
            +
              "industrialise": "industrialize",
         | 
| 795 | 
            +
              "industrialised": "industrialized",
         | 
| 796 | 
            +
              "industrialises": "industrializes",
         | 
| 797 | 
            +
              "industrialising": "industrializing",
         | 
| 798 | 
            +
              "inflexion": "inflection",
         | 
| 799 | 
            +
              "inflexions": "inflections",
         | 
| 800 | 
            +
              "initialise": "initialize",
         | 
| 801 | 
            +
              "initialised": "initialized",
         | 
| 802 | 
            +
              "initialises": "initializes",
         | 
| 803 | 
            +
              "initialising": "initializing",
         | 
| 804 | 
            +
              "initialled": "initialed",
         | 
| 805 | 
            +
              "initialling": "initialing",
         | 
| 806 | 
            +
              "instal": "install",
         | 
| 807 | 
            +
              "instalment": "installment",
         | 
| 808 | 
            +
              "instalments": "installments",
         | 
| 809 | 
            +
              "instals": "installs",
         | 
| 810 | 
            +
              "instil": "instill",
         | 
| 811 | 
            +
              "instils": "instills",
         | 
| 812 | 
            +
              "institutionalisation": "institutionalization",
         | 
| 813 | 
            +
              "institutionalise": "institutionalize",
         | 
| 814 | 
            +
              "institutionalised": "institutionalized",
         | 
| 815 | 
            +
              "institutionalises": "institutionalizes",
         | 
| 816 | 
            +
              "institutionalising": "institutionalizing",
         | 
| 817 | 
            +
              "intellectualise": "intellectualize",
         | 
| 818 | 
            +
              "intellectualised": "intellectualized",
         | 
| 819 | 
            +
              "intellectualises": "intellectualizes",
         | 
| 820 | 
            +
              "intellectualising": "intellectualizing",
         | 
| 821 | 
            +
              "internalisation": "internalization",
         | 
| 822 | 
            +
              "internalise": "internalize",
         | 
| 823 | 
            +
              "internalised": "internalized",
         | 
| 824 | 
            +
              "internalises": "internalizes",
         | 
| 825 | 
            +
              "internalising": "internalizing",
         | 
| 826 | 
            +
              "internationalisation": "internationalization",
         | 
| 827 | 
            +
              "internationalise": "internationalize",
         | 
| 828 | 
            +
              "internationalised": "internationalized",
         | 
| 829 | 
            +
              "internationalises": "internationalizes",
         | 
| 830 | 
            +
              "internationalising": "internationalizing",
         | 
| 831 | 
            +
              "ionisation": "ionization",
         | 
| 832 | 
            +
              "ionise": "ionize",
         | 
| 833 | 
            +
              "ionised": "ionized",
         | 
| 834 | 
            +
              "ioniser": "ionizer",
         | 
| 835 | 
            +
              "ionisers": "ionizers",
         | 
| 836 | 
            +
              "ionises": "ionizes",
         | 
| 837 | 
            +
              "ionising": "ionizing",
         | 
| 838 | 
            +
              "italicise": "italicize",
         | 
| 839 | 
            +
              "italicised": "italicized",
         | 
| 840 | 
            +
              "italicises": "italicizes",
         | 
| 841 | 
            +
              "italicising": "italicizing",
         | 
| 842 | 
            +
              "itemise": "itemize",
         | 
| 843 | 
            +
              "itemised": "itemized",
         | 
| 844 | 
            +
              "itemises": "itemizes",
         | 
| 845 | 
            +
              "itemising": "itemizing",
         | 
| 846 | 
            +
              "jeopardise": "jeopardize",
         | 
| 847 | 
            +
              "jeopardised": "jeopardized",
         | 
| 848 | 
            +
              "jeopardises": "jeopardizes",
         | 
| 849 | 
            +
              "jeopardising": "jeopardizing",
         | 
| 850 | 
            +
              "jewelled": "jeweled",
         | 
| 851 | 
            +
              "jeweller": "jeweler",
         | 
| 852 | 
            +
              "jewellers": "jewelers",
         | 
| 853 | 
            +
              "jewellery": "jewelry",
         | 
| 854 | 
            +
              "judgement": "judgment",
         | 
| 855 | 
            +
              "kilogramme": "kilogram",
         | 
| 856 | 
            +
              "kilogrammes": "kilograms",
         | 
| 857 | 
            +
              "kilometre": "kilometer",
         | 
| 858 | 
            +
              "kilometres": "kilometers",
         | 
| 859 | 
            +
              "labelled": "labeled",
         | 
| 860 | 
            +
              "labelling": "labeling",
         | 
| 861 | 
            +
              "labour": "labor",
         | 
| 862 | 
            +
              "laboured": "labored",
         | 
| 863 | 
            +
              "labourer": "laborer",
         | 
| 864 | 
            +
              "labourers": "laborers",
         | 
| 865 | 
            +
              "labouring": "laboring",
         | 
| 866 | 
            +
              "labours": "labors",
         | 
| 867 | 
            +
              "lacklustre": "lackluster",
         | 
| 868 | 
            +
              "legalisation": "legalization",
         | 
| 869 | 
            +
              "legalise": "legalize",
         | 
| 870 | 
            +
              "legalised": "legalized",
         | 
| 871 | 
            +
              "legalises": "legalizes",
         | 
| 872 | 
            +
              "legalising": "legalizing",
         | 
| 873 | 
            +
              "legitimise": "legitimize",
         | 
| 874 | 
            +
              "legitimised": "legitimized",
         | 
| 875 | 
            +
              "legitimises": "legitimizes",
         | 
| 876 | 
            +
              "legitimising": "legitimizing",
         | 
| 877 | 
            +
              "leukaemia": "leukemia",
         | 
| 878 | 
            +
              "levelled": "leveled",
         | 
| 879 | 
            +
              "leveller": "leveler",
         | 
| 880 | 
            +
              "levellers": "levelers",
         | 
| 881 | 
            +
              "levelling": "leveling",
         | 
| 882 | 
            +
              "libelled": "libeled",
         | 
| 883 | 
            +
              "libelling": "libeling",
         | 
| 884 | 
            +
              "libellous": "libelous",
         | 
| 885 | 
            +
              "liberalisation": "liberalization",
         | 
| 886 | 
            +
              "liberalise": "liberalize",
         | 
| 887 | 
            +
              "liberalised": "liberalized",
         | 
| 888 | 
            +
              "liberalises": "liberalizes",
         | 
| 889 | 
            +
              "liberalising": "liberalizing",
         | 
| 890 | 
            +
              "licence": "license",
         | 
| 891 | 
            +
              "licenced": "licensed",
         | 
| 892 | 
            +
              "licences": "licenses",
         | 
| 893 | 
            +
              "licencing": "licensing",
         | 
| 894 | 
            +
              "likeable": "likable",
         | 
| 895 | 
            +
              "lionisation": "lionization",
         | 
| 896 | 
            +
              "lionise": "lionize",
         | 
| 897 | 
            +
              "lionised": "lionized",
         | 
| 898 | 
            +
              "lionises": "lionizes",
         | 
| 899 | 
            +
              "lionising": "lionizing",
         | 
| 900 | 
            +
              "liquidise": "liquidize",
         | 
| 901 | 
            +
              "liquidised": "liquidized",
         | 
| 902 | 
            +
              "liquidiser": "liquidizer",
         | 
| 903 | 
            +
              "liquidisers": "liquidizers",
         | 
| 904 | 
            +
              "liquidises": "liquidizes",
         | 
| 905 | 
            +
              "liquidising": "liquidizing",
         | 
| 906 | 
            +
              "litre": "liter",
         | 
| 907 | 
            +
              "litres": "liters",
         | 
| 908 | 
            +
              "localise": "localize",
         | 
| 909 | 
            +
              "localised": "localized",
         | 
| 910 | 
            +
              "localises": "localizes",
         | 
| 911 | 
            +
              "localising": "localizing",
         | 
| 912 | 
            +
              "louvre": "louver",
         | 
| 913 | 
            +
              "louvred": "louvered",
         | 
| 914 | 
            +
              "louvres": "louvers",
         | 
| 915 | 
            +
              "lustre": "luster",
         | 
| 916 | 
            +
              "magnetise": "magnetize",
         | 
| 917 | 
            +
              "magnetised": "magnetized",
         | 
| 918 | 
            +
              "magnetises": "magnetizes",
         | 
| 919 | 
            +
              "magnetising": "magnetizing",
         | 
| 920 | 
            +
              "manoeuvrability": "maneuverability",
         | 
| 921 | 
            +
              "manoeuvrable": "maneuverable",
         | 
| 922 | 
            +
              "manoeuvre": "maneuver",
         | 
| 923 | 
            +
              "manoeuvred": "maneuvered",
         | 
| 924 | 
            +
              "manoeuvres": "maneuvers",
         | 
| 925 | 
            +
              "manoeuvring": "maneuvering",
         | 
| 926 | 
            +
              "manoeuvrings": "maneuverings",
         | 
| 927 | 
            +
              "marginalisation": "marginalization",
         | 
| 928 | 
            +
              "marginalise": "marginalize",
         | 
| 929 | 
            +
              "marginalised": "marginalized",
         | 
| 930 | 
            +
              "marginalises": "marginalizes",
         | 
| 931 | 
            +
              "marginalising": "marginalizing",
         | 
| 932 | 
            +
              "marshalled": "marshaled",
         | 
| 933 | 
            +
              "marshalling": "marshaling",
         | 
| 934 | 
            +
              "marvelled": "marveled",
         | 
| 935 | 
            +
              "marvelling": "marveling",
         | 
| 936 | 
            +
              "marvellous": "marvelous",
         | 
| 937 | 
            +
              "marvellously": "marvelously",
         | 
| 938 | 
            +
              "materialisation": "materialization",
         | 
| 939 | 
            +
              "materialise": "materialize",
         | 
| 940 | 
            +
              "materialised": "materialized",
         | 
| 941 | 
            +
              "materialises": "materializes",
         | 
| 942 | 
            +
              "materialising": "materializing",
         | 
| 943 | 
            +
              "maximisation": "maximization",
         | 
| 944 | 
            +
              "maximise": "maximize",
         | 
| 945 | 
            +
              "maximised": "maximized",
         | 
| 946 | 
            +
              "maximises": "maximizes",
         | 
| 947 | 
            +
              "maximising": "maximizing",
         | 
| 948 | 
            +
              "meagre": "meager",
         | 
| 949 | 
            +
              "mechanisation": "mechanization",
         | 
| 950 | 
            +
              "mechanise": "mechanize",
         | 
| 951 | 
            +
              "mechanised": "mechanized",
         | 
| 952 | 
            +
              "mechanises": "mechanizes",
         | 
| 953 | 
            +
              "mechanising": "mechanizing",
         | 
| 954 | 
            +
              "mediaeval": "medieval",
         | 
| 955 | 
            +
              "memorialise": "memorialize",
         | 
| 956 | 
            +
              "memorialised": "memorialized",
         | 
| 957 | 
            +
              "memorialises": "memorializes",
         | 
| 958 | 
            +
              "memorialising": "memorializing",
         | 
| 959 | 
            +
              "memorise": "memorize",
         | 
| 960 | 
            +
              "memorised": "memorized",
         | 
| 961 | 
            +
              "memorises": "memorizes",
         | 
| 962 | 
            +
              "memorising": "memorizing",
         | 
| 963 | 
            +
              "mesmerise": "mesmerize",
         | 
| 964 | 
            +
              "mesmerised": "mesmerized",
         | 
| 965 | 
            +
              "mesmerises": "mesmerizes",
         | 
| 966 | 
            +
              "mesmerising": "mesmerizing",
         | 
| 967 | 
            +
              "metabolise": "metabolize",
         | 
| 968 | 
            +
              "metabolised": "metabolized",
         | 
| 969 | 
            +
              "metabolises": "metabolizes",
         | 
| 970 | 
            +
              "metabolising": "metabolizing",
         | 
| 971 | 
            +
              "metre": "meter",
         | 
| 972 | 
            +
              "metres": "meters",
         | 
| 973 | 
            +
              "mhm": "hmm",
         | 
| 974 | 
            +
              "micrometre": "micrometer",
         | 
| 975 | 
            +
              "micrometres": "micrometers",
         | 
| 976 | 
            +
              "militarise": "militarize",
         | 
| 977 | 
            +
              "militarised": "militarized",
         | 
| 978 | 
            +
              "militarises": "militarizes",
         | 
| 979 | 
            +
              "militarising": "militarizing",
         | 
| 980 | 
            +
              "milligramme": "milligram",
         | 
| 981 | 
            +
              "milligrammes": "milligrams",
         | 
| 982 | 
            +
              "millilitre": "milliliter",
         | 
| 983 | 
            +
              "millilitres": "milliliters",
         | 
| 984 | 
            +
              "millimetre": "millimeter",
         | 
| 985 | 
            +
              "millimetres": "millimeters",
         | 
| 986 | 
            +
              "miniaturisation": "miniaturization",
         | 
| 987 | 
            +
              "miniaturise": "miniaturize",
         | 
| 988 | 
            +
              "miniaturised": "miniaturized",
         | 
| 989 | 
            +
              "miniaturises": "miniaturizes",
         | 
| 990 | 
            +
              "miniaturising": "miniaturizing",
         | 
| 991 | 
            +
              "minibusses": "minibuses",
         | 
| 992 | 
            +
              "minimise": "minimize",
         | 
| 993 | 
            +
              "minimised": "minimized",
         | 
| 994 | 
            +
              "minimises": "minimizes",
         | 
| 995 | 
            +
              "minimising": "minimizing",
         | 
| 996 | 
            +
              "misbehaviour": "misbehavior",
         | 
| 997 | 
            +
              "misdemeanour": "misdemeanor",
         | 
| 998 | 
            +
              "misdemeanours": "misdemeanors",
         | 
| 999 | 
            +
              "misspelt": "misspelled",
         | 
| 1000 | 
            +
              "mitre": "miter",
         | 
| 1001 | 
            +
              "mitres": "miters",
         | 
| 1002 | 
            +
              "mm": "hmm",
         | 
| 1003 | 
            +
              "mmm": "hmm",
         | 
| 1004 | 
            +
              "mobilisation": "mobilization",
         | 
| 1005 | 
            +
              "mobilise": "mobilize",
         | 
| 1006 | 
            +
              "mobilised": "mobilized",
         | 
| 1007 | 
            +
              "mobilises": "mobilizes",
         | 
| 1008 | 
            +
              "mobilising": "mobilizing",
         | 
| 1009 | 
            +
              "modelled": "modeled",
         | 
| 1010 | 
            +
              "modeller": "modeler",
         | 
| 1011 | 
            +
              "modellers": "modelers",
         | 
| 1012 | 
            +
              "modelling": "modeling",
         | 
| 1013 | 
            +
              "modernise": "modernize",
         | 
| 1014 | 
            +
              "modernised": "modernized",
         | 
| 1015 | 
            +
              "modernises": "modernizes",
         | 
| 1016 | 
            +
              "modernising": "modernizing",
         | 
| 1017 | 
            +
              "moisturise": "moisturize",
         | 
| 1018 | 
            +
              "moisturised": "moisturized",
         | 
| 1019 | 
            +
              "moisturiser": "moisturizer",
         | 
| 1020 | 
            +
              "moisturisers": "moisturizers",
         | 
| 1021 | 
            +
              "moisturises": "moisturizes",
         | 
| 1022 | 
            +
              "moisturising": "moisturizing",
         | 
| 1023 | 
            +
              "monologue": "monolog",
         | 
| 1024 | 
            +
              "monologues": "monologs",
         | 
| 1025 | 
            +
              "monopolisation": "monopolization",
         | 
| 1026 | 
            +
              "monopolise": "monopolize",
         | 
| 1027 | 
            +
              "monopolised": "monopolized",
         | 
| 1028 | 
            +
              "monopolises": "monopolizes",
         | 
| 1029 | 
            +
              "monopolising": "monopolizing",
         | 
| 1030 | 
            +
              "moralise": "moralize",
         | 
| 1031 | 
            +
              "moralised": "moralized",
         | 
| 1032 | 
            +
              "moralises": "moralizes",
         | 
| 1033 | 
            +
              "moralising": "moralizing",
         | 
| 1034 | 
            +
              "motorised": "motorized",
         | 
| 1035 | 
            +
              "mould": "mold",
         | 
| 1036 | 
            +
              "moulded": "molded",
         | 
| 1037 | 
            +
              "moulder": "molder",
         | 
| 1038 | 
            +
              "mouldered": "moldered",
         | 
| 1039 | 
            +
              "mouldering": "moldering",
         | 
| 1040 | 
            +
              "moulders": "molders",
         | 
| 1041 | 
            +
              "mouldier": "moldier",
         | 
| 1042 | 
            +
              "mouldiest": "moldiest",
         | 
| 1043 | 
            +
              "moulding": "molding",
         | 
| 1044 | 
            +
              "mouldings": "moldings",
         | 
| 1045 | 
            +
              "moulds": "molds",
         | 
| 1046 | 
            +
              "mouldy": "moldy",
         | 
| 1047 | 
            +
              "moult": "molt",
         | 
| 1048 | 
            +
              "moulted": "molted",
         | 
| 1049 | 
            +
              "moulting": "molting",
         | 
| 1050 | 
            +
              "moults": "molts",
         | 
| 1051 | 
            +
              "moustache": "mustache",
         | 
| 1052 | 
            +
              "moustached": "mustached",
         | 
| 1053 | 
            +
              "moustaches": "mustaches",
         | 
| 1054 | 
            +
              "moustachioed": "mustachioed",
         | 
| 1055 | 
            +
              "multicoloured": "multicolored",
         | 
| 1056 | 
            +
              "nationalisation": "nationalization",
         | 
| 1057 | 
            +
              "nationalisations": "nationalizations",
         | 
| 1058 | 
            +
              "nationalise": "nationalize",
         | 
| 1059 | 
            +
              "nationalised": "nationalized",
         | 
| 1060 | 
            +
              "nationalises": "nationalizes",
         | 
| 1061 | 
            +
              "nationalising": "nationalizing",
         | 
| 1062 | 
            +
              "naturalisation": "naturalization",
         | 
| 1063 | 
            +
              "naturalise": "naturalize",
         | 
| 1064 | 
            +
              "naturalised": "naturalized",
         | 
| 1065 | 
            +
              "naturalises": "naturalizes",
         | 
| 1066 | 
            +
              "naturalising": "naturalizing",
         | 
| 1067 | 
            +
              "neighbour": "neighbor",
         | 
| 1068 | 
            +
              "neighbourhood": "neighborhood",
         | 
| 1069 | 
            +
              "neighbourhoods": "neighborhoods",
         | 
| 1070 | 
            +
              "neighbouring": "neighboring",
         | 
| 1071 | 
            +
              "neighbourliness": "neighborliness",
         | 
| 1072 | 
            +
              "neighbourly": "neighborly",
         | 
| 1073 | 
            +
              "neighbours": "neighbors",
         | 
| 1074 | 
            +
              "neutralisation": "neutralization",
         | 
| 1075 | 
            +
              "neutralise": "neutralize",
         | 
| 1076 | 
            +
              "neutralised": "neutralized",
         | 
| 1077 | 
            +
              "neutralises": "neutralizes",
         | 
| 1078 | 
            +
              "neutralising": "neutralizing",
         | 
| 1079 | 
            +
              "normalisation": "normalization",
         | 
| 1080 | 
            +
              "normalise": "normalize",
         | 
| 1081 | 
            +
              "normalised": "normalized",
         | 
| 1082 | 
            +
              "normalises": "normalizes",
         | 
| 1083 | 
            +
              "normalising": "normalizing",
         | 
| 1084 | 
            +
              "odour": "odor",
         | 
| 1085 | 
            +
              "odourless": "odorless",
         | 
| 1086 | 
            +
              "odours": "odors",
         | 
| 1087 | 
            +
              "oesophagus": "esophagus",
         | 
| 1088 | 
            +
              "oesophaguses": "esophaguses",
         | 
| 1089 | 
            +
              "oestrogen": "estrogen",
         | 
| 1090 | 
            +
              "offence": "offense",
         | 
| 1091 | 
            +
              "offences": "offenses",
         | 
| 1092 | 
            +
              "omelette": "omelet",
         | 
| 1093 | 
            +
              "omelettes": "omelets",
         | 
| 1094 | 
            +
              "optimise": "optimize",
         | 
| 1095 | 
            +
              "optimised": "optimized",
         | 
| 1096 | 
            +
              "optimises": "optimizes",
         | 
| 1097 | 
            +
              "optimising": "optimizing",
         | 
| 1098 | 
            +
              "organisation": "organization",
         | 
| 1099 | 
            +
              "organisational": "organizational",
         | 
| 1100 | 
            +
              "organisations": "organizations",
         | 
| 1101 | 
            +
              "organise": "organize",
         | 
| 1102 | 
            +
              "organised": "organized",
         | 
| 1103 | 
            +
              "organiser": "organizer",
         | 
| 1104 | 
            +
              "organisers": "organizers",
         | 
| 1105 | 
            +
              "organises": "organizes",
         | 
| 1106 | 
            +
              "organising": "organizing",
         | 
| 1107 | 
            +
              "orthopaedic": "orthopedic",
         | 
| 1108 | 
            +
              "orthopaedics": "orthopedics",
         | 
| 1109 | 
            +
              "ostracise": "ostracize",
         | 
| 1110 | 
            +
              "ostracised": "ostracized",
         | 
| 1111 | 
            +
              "ostracises": "ostracizes",
         | 
| 1112 | 
            +
              "ostracising": "ostracizing",
         | 
| 1113 | 
            +
              "outmanoeuvre": "outmaneuver",
         | 
| 1114 | 
            +
              "outmanoeuvred": "outmaneuvered",
         | 
| 1115 | 
            +
              "outmanoeuvres": "outmaneuvers",
         | 
| 1116 | 
            +
              "outmanoeuvring": "outmaneuvering",
         | 
| 1117 | 
            +
              "overemphasise": "overemphasize",
         | 
| 1118 | 
            +
              "overemphasised": "overemphasized",
         | 
| 1119 | 
            +
              "overemphasises": "overemphasizes",
         | 
| 1120 | 
            +
              "overemphasising": "overemphasizing",
         | 
| 1121 | 
            +
              "oxidisation": "oxidization",
         | 
| 1122 | 
            +
              "oxidise": "oxidize",
         | 
| 1123 | 
            +
              "oxidised": "oxidized",
         | 
| 1124 | 
            +
              "oxidises": "oxidizes",
         | 
| 1125 | 
            +
              "oxidising": "oxidizing",
         | 
| 1126 | 
            +
              "paederast": "pederast",
         | 
| 1127 | 
            +
              "paederasts": "pederasts",
         | 
| 1128 | 
            +
              "paediatric": "pediatric",
         | 
| 1129 | 
            +
              "paediatrician": "pediatrician",
         | 
| 1130 | 
            +
              "paediatricians": "pediatricians",
         | 
| 1131 | 
            +
              "paediatrics": "pediatrics",
         | 
| 1132 | 
            +
              "paedophile": "pedophile",
         | 
| 1133 | 
            +
              "paedophiles": "pedophiles",
         | 
| 1134 | 
            +
              "paedophilia": "pedophilia",
         | 
| 1135 | 
            +
              "palaeolithic": "paleolithic",
         | 
| 1136 | 
            +
              "palaeontologist": "paleontologist",
         | 
| 1137 | 
            +
              "palaeontologists": "paleontologists",
         | 
| 1138 | 
            +
              "palaeontology": "paleontology",
         | 
| 1139 | 
            +
              "panelled": "paneled",
         | 
| 1140 | 
            +
              "panelling": "paneling",
         | 
| 1141 | 
            +
              "panellist": "panelist",
         | 
| 1142 | 
            +
              "panellists": "panelists",
         | 
| 1143 | 
            +
              "paralyse": "paralyze",
         | 
| 1144 | 
            +
              "paralysed": "paralyzed",
         | 
| 1145 | 
            +
              "paralyses": "paralyzes",
         | 
| 1146 | 
            +
              "paralysing": "paralyzing",
         | 
| 1147 | 
            +
              "parcelled": "parceled",
         | 
| 1148 | 
            +
              "parcelling": "parceling",
         | 
| 1149 | 
            +
              "parlour": "parlor",
         | 
| 1150 | 
            +
              "parlours": "parlors",
         | 
| 1151 | 
            +
              "particularise": "particularize",
         | 
| 1152 | 
            +
              "particularised": "particularized",
         | 
| 1153 | 
            +
              "particularises": "particularizes",
         | 
| 1154 | 
            +
              "particularising": "particularizing",
         | 
| 1155 | 
            +
              "passivisation": "passivization",
         | 
| 1156 | 
            +
              "passivise": "passivize",
         | 
| 1157 | 
            +
              "passivised": "passivized",
         | 
| 1158 | 
            +
              "passivises": "passivizes",
         | 
| 1159 | 
            +
              "passivising": "passivizing",
         | 
| 1160 | 
            +
              "pasteurisation": "pasteurization",
         | 
| 1161 | 
            +
              "pasteurise": "pasteurize",
         | 
| 1162 | 
            +
              "pasteurised": "pasteurized",
         | 
| 1163 | 
            +
              "pasteurises": "pasteurizes",
         | 
| 1164 | 
            +
              "pasteurising": "pasteurizing",
         | 
| 1165 | 
            +
              "patronise": "patronize",
         | 
| 1166 | 
            +
              "patronised": "patronized",
         | 
| 1167 | 
            +
              "patronises": "patronizes",
         | 
| 1168 | 
            +
              "patronising": "patronizing",
         | 
| 1169 | 
            +
              "patronisingly": "patronizingly",
         | 
| 1170 | 
            +
              "pedalled": "pedaled",
         | 
| 1171 | 
            +
              "pedalling": "pedaling",
         | 
| 1172 | 
            +
              "pedestrianisation": "pedestrianization",
         | 
| 1173 | 
            +
              "pedestrianise": "pedestrianize",
         | 
| 1174 | 
            +
              "pedestrianised": "pedestrianized",
         | 
| 1175 | 
            +
              "pedestrianises": "pedestrianizes",
         | 
| 1176 | 
            +
              "pedestrianising": "pedestrianizing",
         | 
| 1177 | 
            +
              "penalise": "penalize",
         | 
| 1178 | 
            +
              "penalised": "penalized",
         | 
| 1179 | 
            +
              "penalises": "penalizes",
         | 
| 1180 | 
            +
              "penalising": "penalizing",
         | 
| 1181 | 
            +
              "pencilled": "penciled",
         | 
| 1182 | 
            +
              "pencilling": "penciling",
         | 
| 1183 | 
            +
              "personalise": "personalize",
         | 
| 1184 | 
            +
              "personalised": "personalized",
         | 
| 1185 | 
            +
              "personalises": "personalizes",
         | 
| 1186 | 
            +
              "personalising": "personalizing",
         | 
| 1187 | 
            +
              "pharmacopoeia": "pharmacopeia",
         | 
| 1188 | 
            +
              "pharmacopoeias": "pharmacopeias",
         | 
| 1189 | 
            +
              "philosophise": "philosophize",
         | 
| 1190 | 
            +
              "philosophised": "philosophized",
         | 
| 1191 | 
            +
              "philosophises": "philosophizes",
         | 
| 1192 | 
            +
              "philosophising": "philosophizing",
         | 
| 1193 | 
            +
              "philtre": "filter",
         | 
| 1194 | 
            +
              "philtres": "filters",
         | 
| 1195 | 
            +
              "phoney": "phony",
         | 
| 1196 | 
            +
              "plagiarise": "plagiarize",
         | 
| 1197 | 
            +
              "plagiarised": "plagiarized",
         | 
| 1198 | 
            +
              "plagiarises": "plagiarizes",
         | 
| 1199 | 
            +
              "plagiarising": "plagiarizing",
         | 
| 1200 | 
            +
              "plough": "plow",
         | 
| 1201 | 
            +
              "ploughed": "plowed",
         | 
| 1202 | 
            +
              "ploughing": "plowing",
         | 
| 1203 | 
            +
              "ploughman": "plowman",
         | 
| 1204 | 
            +
              "ploughmen": "plowmen",
         | 
| 1205 | 
            +
              "ploughs": "plows",
         | 
| 1206 | 
            +
              "ploughshare": "plowshare",
         | 
| 1207 | 
            +
              "ploughshares": "plowshares",
         | 
| 1208 | 
            +
              "polarisation": "polarization",
         | 
| 1209 | 
            +
              "polarise": "polarize",
         | 
| 1210 | 
            +
              "polarised": "polarized",
         | 
| 1211 | 
            +
              "polarises": "polarizes",
         | 
| 1212 | 
            +
              "polarising": "polarizing",
         | 
| 1213 | 
            +
              "politicisation": "politicization",
         | 
| 1214 | 
            +
              "politicise": "politicize",
         | 
| 1215 | 
            +
              "politicised": "politicized",
         | 
| 1216 | 
            +
              "politicises": "politicizes",
         | 
| 1217 | 
            +
              "politicising": "politicizing",
         | 
| 1218 | 
            +
              "popularisation": "popularization",
         | 
| 1219 | 
            +
              "popularise": "popularize",
         | 
| 1220 | 
            +
              "popularised": "popularized",
         | 
| 1221 | 
            +
              "popularises": "popularizes",
         | 
| 1222 | 
            +
              "popularising": "popularizing",
         | 
| 1223 | 
            +
              "pouffe": "pouf",
         | 
| 1224 | 
            +
              "pouffes": "poufs",
         | 
| 1225 | 
            +
              "practise": "practice",
         | 
| 1226 | 
            +
              "practised": "practiced",
         | 
| 1227 | 
            +
              "practises": "practices",
         | 
| 1228 | 
            +
              "practising": "practicing",
         | 
| 1229 | 
            +
              "praesidium": "presidium",
         | 
| 1230 | 
            +
              "praesidiums": "presidiums",
         | 
| 1231 | 
            +
              "pressurisation": "pressurization",
         | 
| 1232 | 
            +
              "pressurise": "pressurize",
         | 
| 1233 | 
            +
              "pressurised": "pressurized",
         | 
| 1234 | 
            +
              "pressurises": "pressurizes",
         | 
| 1235 | 
            +
              "pressurising": "pressurizing",
         | 
| 1236 | 
            +
              "pretence": "pretense",
         | 
| 1237 | 
            +
              "pretences": "pretenses",
         | 
| 1238 | 
            +
              "primaeval": "primeval",
         | 
| 1239 | 
            +
              "prioritisation": "prioritization",
         | 
| 1240 | 
            +
              "prioritise": "prioritize",
         | 
| 1241 | 
            +
              "prioritised": "prioritized",
         | 
| 1242 | 
            +
              "prioritises": "prioritizes",
         | 
| 1243 | 
            +
              "prioritising": "prioritizing",
         | 
| 1244 | 
            +
              "privatisation": "privatization",
         | 
| 1245 | 
            +
              "privatisations": "privatizations",
         | 
| 1246 | 
            +
              "privatise": "privatize",
         | 
| 1247 | 
            +
              "privatised": "privatized",
         | 
| 1248 | 
            +
              "privatises": "privatizes",
         | 
| 1249 | 
            +
              "privatising": "privatizing",
         | 
| 1250 | 
            +
              "professionalisation": "professionalization",
         | 
| 1251 | 
            +
              "professionalise": "professionalize",
         | 
| 1252 | 
            +
              "professionalised": "professionalized",
         | 
| 1253 | 
            +
              "professionalises": "professionalizes",
         | 
| 1254 | 
            +
              "professionalising": "professionalizing",
         | 
| 1255 | 
            +
              "programme": "program",
         | 
| 1256 | 
            +
              "programmes": "programs",
         | 
| 1257 | 
            +
              "prologue": "prolog",
         | 
| 1258 | 
            +
              "prologues": "prologs",
         | 
| 1259 | 
            +
              "propagandise": "propagandize",
         | 
| 1260 | 
            +
              "propagandised": "propagandized",
         | 
| 1261 | 
            +
              "propagandises": "propagandizes",
         | 
| 1262 | 
            +
              "propagandising": "propagandizing",
         | 
| 1263 | 
            +
              "proselytise": "proselytize",
         | 
| 1264 | 
            +
              "proselytised": "proselytized",
         | 
| 1265 | 
            +
              "proselytiser": "proselytizer",
         | 
| 1266 | 
            +
              "proselytisers": "proselytizers",
         | 
| 1267 | 
            +
              "proselytises": "proselytizes",
         | 
| 1268 | 
            +
              "proselytising": "proselytizing",
         | 
| 1269 | 
            +
              "psychoanalyse": "psychoanalyze",
         | 
| 1270 | 
            +
              "psychoanalysed": "psychoanalyzed",
         | 
| 1271 | 
            +
              "psychoanalyses": "psychoanalyzes",
         | 
| 1272 | 
            +
              "psychoanalysing": "psychoanalyzing",
         | 
| 1273 | 
            +
              "publicise": "publicize",
         | 
| 1274 | 
            +
              "publicised": "publicized",
         | 
| 1275 | 
            +
              "publicises": "publicizes",
         | 
| 1276 | 
            +
              "publicising": "publicizing",
         | 
| 1277 | 
            +
              "pulverisation": "pulverization",
         | 
| 1278 | 
            +
              "pulverise": "pulverize",
         | 
| 1279 | 
            +
              "pulverised": "pulverized",
         | 
| 1280 | 
            +
              "pulverises": "pulverizes",
         | 
| 1281 | 
            +
              "pulverising": "pulverizing",
         | 
| 1282 | 
            +
              "pummelled": "pummel",
         | 
| 1283 | 
            +
              "pummelling": "pummeled",
         | 
| 1284 | 
            +
              "pyjama": "pajama",
         | 
| 1285 | 
            +
              "pyjamas": "pajamas",
         | 
| 1286 | 
            +
              "pzazz": "pizzazz",
         | 
| 1287 | 
            +
              "quarrelled": "quarreled",
         | 
| 1288 | 
            +
              "quarrelling": "quarreling",
         | 
| 1289 | 
            +
              "radicalise": "radicalize",
         | 
| 1290 | 
            +
              "radicalised": "radicalized",
         | 
| 1291 | 
            +
              "radicalises": "radicalizes",
         | 
| 1292 | 
            +
              "radicalising": "radicalizing",
         | 
| 1293 | 
            +
              "rancour": "rancor",
         | 
| 1294 | 
            +
              "randomise": "randomize",
         | 
| 1295 | 
            +
              "randomised": "randomized",
         | 
| 1296 | 
            +
              "randomises": "randomizes",
         | 
| 1297 | 
            +
              "randomising": "randomizing",
         | 
| 1298 | 
            +
              "rationalisation": "rationalization",
         | 
| 1299 | 
            +
              "rationalisations": "rationalizations",
         | 
| 1300 | 
            +
              "rationalise": "rationalize",
         | 
| 1301 | 
            +
              "rationalised": "rationalized",
         | 
| 1302 | 
            +
              "rationalises": "rationalizes",
         | 
| 1303 | 
            +
              "rationalising": "rationalizing",
         | 
| 1304 | 
            +
              "ravelled": "raveled",
         | 
| 1305 | 
            +
              "ravelling": "raveling",
         | 
| 1306 | 
            +
              "realisable": "realizable",
         | 
| 1307 | 
            +
              "realisation": "realization",
         | 
| 1308 | 
            +
              "realisations": "realizations",
         | 
| 1309 | 
            +
              "realise": "realize",
         | 
| 1310 | 
            +
              "realised": "realized",
         | 
| 1311 | 
            +
              "realises": "realizes",
         | 
| 1312 | 
            +
              "realising": "realizing",
         | 
| 1313 | 
            +
              "recognisable": "recognizable",
         | 
| 1314 | 
            +
              "recognisably": "recognizably",
         | 
| 1315 | 
            +
              "recognisance": "recognizance",
         | 
| 1316 | 
            +
              "recognise": "recognize",
         | 
| 1317 | 
            +
              "recognised": "recognized",
         | 
| 1318 | 
            +
              "recognises": "recognizes",
         | 
| 1319 | 
            +
              "recognising": "recognizing",
         | 
| 1320 | 
            +
              "reconnoitre": "reconnoiter",
         | 
| 1321 | 
            +
              "reconnoitred": "reconnoitered",
         | 
| 1322 | 
            +
              "reconnoitres": "reconnoiters",
         | 
| 1323 | 
            +
              "reconnoitring": "reconnoitering",
         | 
| 1324 | 
            +
              "refuelled": "refueled",
         | 
| 1325 | 
            +
              "refuelling": "refueling",
         | 
| 1326 | 
            +
              "regularisation": "regularization",
         | 
| 1327 | 
            +
              "regularise": "regularize",
         | 
| 1328 | 
            +
              "regularised": "regularized",
         | 
| 1329 | 
            +
              "regularises": "regularizes",
         | 
| 1330 | 
            +
              "regularising": "regularizing",
         | 
| 1331 | 
            +
              "remodelled": "remodeled",
         | 
| 1332 | 
            +
              "remodelling": "remodeling",
         | 
| 1333 | 
            +
              "remould": "remold",
         | 
| 1334 | 
            +
              "remoulded": "remolded",
         | 
| 1335 | 
            +
              "remoulding": "remolding",
         | 
| 1336 | 
            +
              "remoulds": "remolds",
         | 
| 1337 | 
            +
              "reorganisation": "reorganization",
         | 
| 1338 | 
            +
              "reorganisations": "reorganizations",
         | 
| 1339 | 
            +
              "reorganise": "reorganize",
         | 
| 1340 | 
            +
              "reorganised": "reorganized",
         | 
| 1341 | 
            +
              "reorganises": "reorganizes",
         | 
| 1342 | 
            +
              "reorganising": "reorganizing",
         | 
| 1343 | 
            +
              "revelled": "reveled",
         | 
| 1344 | 
            +
              "reveller": "reveler",
         | 
| 1345 | 
            +
              "revellers": "revelers",
         | 
| 1346 | 
            +
              "revelling": "reveling",
         | 
| 1347 | 
            +
              "revitalise": "revitalize",
         | 
| 1348 | 
            +
              "revitalised": "revitalized",
         | 
| 1349 | 
            +
              "revitalises": "revitalizes",
         | 
| 1350 | 
            +
              "revitalising": "revitalizing",
         | 
| 1351 | 
            +
              "revolutionise": "revolutionize",
         | 
| 1352 | 
            +
              "revolutionised": "revolutionized",
         | 
| 1353 | 
            +
              "revolutionises": "revolutionizes",
         | 
| 1354 | 
            +
              "revolutionising": "revolutionizing",
         | 
| 1355 | 
            +
              "rhapsodise": "rhapsodize",
         | 
| 1356 | 
            +
              "rhapsodised": "rhapsodized",
         | 
| 1357 | 
            +
              "rhapsodises": "rhapsodizes",
         | 
| 1358 | 
            +
              "rhapsodising": "rhapsodizing",
         | 
| 1359 | 
            +
              "rigour": "rigor",
         | 
| 1360 | 
            +
              "rigours": "rigors",
         | 
| 1361 | 
            +
              "ritualised": "ritualized",
         | 
| 1362 | 
            +
              "rivalled": "rivaled",
         | 
| 1363 | 
            +
              "rivalling": "rivaling",
         | 
| 1364 | 
            +
              "romanticise": "romanticize",
         | 
| 1365 | 
            +
              "romanticised": "romanticized",
         | 
| 1366 | 
            +
              "romanticises": "romanticizes",
         | 
| 1367 | 
            +
              "romanticising": "romanticizing",
         | 
| 1368 | 
            +
              "rumour": "rumor",
         | 
| 1369 | 
            +
              "rumoured": "rumored",
         | 
| 1370 | 
            +
              "rumours": "rumors",
         | 
| 1371 | 
            +
              "sabre": "saber",
         | 
| 1372 | 
            +
              "sabres": "sabers",
         | 
| 1373 | 
            +
              "saltpetre": "saltpeter",
         | 
| 1374 | 
            +
              "sanitise": "sanitize",
         | 
| 1375 | 
            +
              "sanitised": "sanitized",
         | 
| 1376 | 
            +
              "sanitises": "sanitizes",
         | 
| 1377 | 
            +
              "sanitising": "sanitizing",
         | 
| 1378 | 
            +
              "satirise": "satirize",
         | 
| 1379 | 
            +
              "satirised": "satirized",
         | 
| 1380 | 
            +
              "satirises": "satirizes",
         | 
| 1381 | 
            +
              "satirising": "satirizing",
         | 
| 1382 | 
            +
              "saviour": "savior",
         | 
| 1383 | 
            +
              "saviours": "saviors",
         | 
| 1384 | 
            +
              "savour": "savor",
         | 
| 1385 | 
            +
              "savoured": "savored",
         | 
| 1386 | 
            +
              "savouries": "savories",
         | 
| 1387 | 
            +
              "savouring": "savoring",
         | 
| 1388 | 
            +
              "savours": "savors",
         | 
| 1389 | 
            +
              "savoury": "savory",
         | 
| 1390 | 
            +
              "scandalise": "scandalize",
         | 
| 1391 | 
            +
              "scandalised": "scandalized",
         | 
| 1392 | 
            +
              "scandalises": "scandalizes",
         | 
| 1393 | 
            +
              "scandalising": "scandalizing",
         | 
| 1394 | 
            +
              "sceptic": "skeptic",
         | 
| 1395 | 
            +
              "sceptical": "skeptical",
         | 
| 1396 | 
            +
              "sceptically": "skeptically",
         | 
| 1397 | 
            +
              "scepticism": "skepticism",
         | 
| 1398 | 
            +
              "sceptics": "skeptics",
         | 
| 1399 | 
            +
              "sceptre": "scepter",
         | 
| 1400 | 
            +
              "sceptres": "scepters",
         | 
| 1401 | 
            +
              "scrutinise": "scrutinize",
         | 
| 1402 | 
            +
              "scrutinised": "scrutinized",
         | 
| 1403 | 
            +
              "scrutinises": "scrutinizes",
         | 
| 1404 | 
            +
              "scrutinising": "scrutinizing",
         | 
| 1405 | 
            +
              "secularisation": "secularization",
         | 
| 1406 | 
            +
              "secularise": "secularize",
         | 
| 1407 | 
            +
              "secularised": "secularized",
         | 
| 1408 | 
            +
              "secularises": "secularizes",
         | 
| 1409 | 
            +
              "secularising": "secularizing",
         | 
| 1410 | 
            +
              "sensationalise": "sensationalize",
         | 
| 1411 | 
            +
              "sensationalised": "sensationalized",
         | 
| 1412 | 
            +
              "sensationalises": "sensationalizes",
         | 
| 1413 | 
            +
              "sensationalising": "sensationalizing",
         | 
| 1414 | 
            +
              "sensitise": "sensitize",
         | 
| 1415 | 
            +
              "sensitised": "sensitized",
         | 
| 1416 | 
            +
              "sensitises": "sensitizes",
         | 
| 1417 | 
            +
              "sensitising": "sensitizing",
         | 
| 1418 | 
            +
              "sentimentalise": "sentimentalize",
         | 
| 1419 | 
            +
              "sentimentalised": "sentimentalized",
         | 
| 1420 | 
            +
              "sentimentalises": "sentimentalizes",
         | 
| 1421 | 
            +
              "sentimentalising": "sentimentalizing",
         | 
| 1422 | 
            +
              "sepulchre": "sepulcher",
         | 
| 1423 | 
            +
              "sepulchres": "sepulchers",
         | 
| 1424 | 
            +
              "serialisation": "serialization",
         | 
| 1425 | 
            +
              "serialisations": "serializations",
         | 
| 1426 | 
            +
              "serialise": "serialize",
         | 
| 1427 | 
            +
              "serialised": "serialized",
         | 
| 1428 | 
            +
              "serialises": "serializes",
         | 
| 1429 | 
            +
              "serialising": "serializing",
         | 
| 1430 | 
            +
              "sermonise": "sermonize",
         | 
| 1431 | 
            +
              "sermonised": "sermonized",
         | 
| 1432 | 
            +
              "sermonises": "sermonizes",
         | 
| 1433 | 
            +
              "sermonising": "sermonizing",
         | 
| 1434 | 
            +
              "sheikh": "sheik",
         | 
| 1435 | 
            +
              "shovelled": "shoveled",
         | 
| 1436 | 
            +
              "shovelling": "shoveling",
         | 
| 1437 | 
            +
              "shrivelled": "shriveled",
         | 
| 1438 | 
            +
              "shrivelling": "shriveling",
         | 
| 1439 | 
            +
              "signalise": "signalize",
         | 
| 1440 | 
            +
              "signalised": "signalized",
         | 
| 1441 | 
            +
              "signalises": "signalizes",
         | 
| 1442 | 
            +
              "signalising": "signalizing",
         | 
| 1443 | 
            +
              "signalled": "signaled",
         | 
| 1444 | 
            +
              "signalling": "signaling",
         | 
| 1445 | 
            +
              "smoulder": "smolder",
         | 
| 1446 | 
            +
              "smouldered": "smoldered",
         | 
| 1447 | 
            +
              "smouldering": "smoldering",
         | 
| 1448 | 
            +
              "smoulders": "smolders",
         | 
| 1449 | 
            +
              "snivelled": "sniveled",
         | 
| 1450 | 
            +
              "snivelling": "sniveling",
         | 
| 1451 | 
            +
              "snorkelled": "snorkeled",
         | 
| 1452 | 
            +
              "snorkelling": "snorkeling",
         | 
| 1453 | 
            +
              "snowplough": "snowplow",
         | 
| 1454 | 
            +
              "snowploughs": "snowplow",
         | 
| 1455 | 
            +
              "socialisation": "socialization",
         | 
| 1456 | 
            +
              "socialise": "socialize",
         | 
| 1457 | 
            +
              "socialised": "socialized",
         | 
| 1458 | 
            +
              "socialises": "socializes",
         | 
| 1459 | 
            +
              "socialising": "socializing",
         | 
| 1460 | 
            +
              "sodomise": "sodomize",
         | 
| 1461 | 
            +
              "sodomised": "sodomized",
         | 
| 1462 | 
            +
              "sodomises": "sodomizes",
         | 
| 1463 | 
            +
              "sodomising": "sodomizing",
         | 
| 1464 | 
            +
              "solemnise": "solemnize",
         | 
| 1465 | 
            +
              "solemnised": "solemnized",
         | 
| 1466 | 
            +
              "solemnises": "solemnizes",
         | 
| 1467 | 
            +
              "solemnising": "solemnizing",
         | 
| 1468 | 
            +
              "sombre": "somber",
         | 
| 1469 | 
            +
              "specialisation": "specialization",
         | 
| 1470 | 
            +
              "specialisations": "specializations",
         | 
| 1471 | 
            +
              "specialise": "specialize",
         | 
| 1472 | 
            +
              "specialised": "specialized",
         | 
| 1473 | 
            +
              "specialises": "specializes",
         | 
| 1474 | 
            +
              "specialising": "specializing",
         | 
| 1475 | 
            +
              "spectre": "specter",
         | 
| 1476 | 
            +
              "spectres": "specters",
         | 
| 1477 | 
            +
              "spiralled": "spiraled",
         | 
| 1478 | 
            +
              "spiralling": "spiraling",
         | 
| 1479 | 
            +
              "splendour": "splendor",
         | 
| 1480 | 
            +
              "splendours": "splendors",
         | 
| 1481 | 
            +
              "squirrelled": "squirreled",
         | 
| 1482 | 
            +
              "squirrelling": "squirreling",
         | 
| 1483 | 
            +
              "stabilisation": "stabilization",
         | 
| 1484 | 
            +
              "stabilise": "stabilize",
         | 
| 1485 | 
            +
              "stabilised": "stabilized",
         | 
| 1486 | 
            +
              "stabiliser": "stabilizer",
         | 
| 1487 | 
            +
              "stabilisers": "stabilizers",
         | 
| 1488 | 
            +
              "stabilises": "stabilizes",
         | 
| 1489 | 
            +
              "stabilising": "stabilizing",
         | 
| 1490 | 
            +
              "standardisation": "standardization",
         | 
| 1491 | 
            +
              "standardise": "standardize",
         | 
| 1492 | 
            +
              "standardised": "standardized",
         | 
| 1493 | 
            +
              "standardises": "standardizes",
         | 
| 1494 | 
            +
              "standardising": "standardizing",
         | 
| 1495 | 
            +
              "stencilled": "stenciled",
         | 
| 1496 | 
            +
              "stencilling": "stenciling",
         | 
| 1497 | 
            +
              "sterilisation": "sterilization",
         | 
| 1498 | 
            +
              "sterilisations": "sterilizations",
         | 
| 1499 | 
            +
              "sterilise": "sterilize",
         | 
| 1500 | 
            +
              "sterilised": "sterilized",
         | 
| 1501 | 
            +
              "steriliser": "sterilizer",
         | 
| 1502 | 
            +
              "sterilisers": "sterilizers",
         | 
| 1503 | 
            +
              "sterilises": "sterilizes",
         | 
| 1504 | 
            +
              "sterilising": "sterilizing",
         | 
| 1505 | 
            +
              "stigmatisation": "stigmatization",
         | 
| 1506 | 
            +
              "stigmatise": "stigmatize",
         | 
| 1507 | 
            +
              "stigmatised": "stigmatized",
         | 
| 1508 | 
            +
              "stigmatises": "stigmatizes",
         | 
| 1509 | 
            +
              "stigmatising": "stigmatizing",
         | 
| 1510 | 
            +
              "storey": "story",
         | 
| 1511 | 
            +
              "storeys": "stories",
         | 
| 1512 | 
            +
              "subsidisation": "subsidization",
         | 
| 1513 | 
            +
              "subsidise": "subsidize",
         | 
| 1514 | 
            +
              "subsidised": "subsidized",
         | 
| 1515 | 
            +
              "subsidiser": "subsidizer",
         | 
| 1516 | 
            +
              "subsidisers": "subsidizers",
         | 
| 1517 | 
            +
              "subsidises": "subsidizes",
         | 
| 1518 | 
            +
              "subsidising": "subsidizing",
         | 
| 1519 | 
            +
              "succour": "succor",
         | 
| 1520 | 
            +
              "succoured": "succored",
         | 
| 1521 | 
            +
              "succouring": "succoring",
         | 
| 1522 | 
            +
              "succours": "succors",
         | 
| 1523 | 
            +
              "sulphate": "sulfate",
         | 
| 1524 | 
            +
              "sulphates": "sulfates",
         | 
| 1525 | 
            +
              "sulphide": "sulfide",
         | 
| 1526 | 
            +
              "sulphides": "sulfides",
         | 
| 1527 | 
            +
              "sulphur": "sulfur",
         | 
| 1528 | 
            +
              "sulphurous": "sulfurous",
         | 
| 1529 | 
            +
              "summarise": "summarize",
         | 
| 1530 | 
            +
              "summarised": "summarized",
         | 
| 1531 | 
            +
              "summarises": "summarizes",
         | 
| 1532 | 
            +
              "summarising": "summarizing",
         | 
| 1533 | 
            +
              "swivelled": "swiveled",
         | 
| 1534 | 
            +
              "swivelling": "swiveling",
         | 
| 1535 | 
            +
              "symbolise": "symbolize",
         | 
| 1536 | 
            +
              "symbolised": "symbolized",
         | 
| 1537 | 
            +
              "symbolises": "symbolizes",
         | 
| 1538 | 
            +
              "symbolising": "symbolizing",
         | 
| 1539 | 
            +
              "sympathise": "sympathize",
         | 
| 1540 | 
            +
              "sympathised": "sympathized",
         | 
| 1541 | 
            +
              "sympathiser": "sympathizer",
         | 
| 1542 | 
            +
              "sympathisers": "sympathizers",
         | 
| 1543 | 
            +
              "sympathises": "sympathizes",
         | 
| 1544 | 
            +
              "sympathising": "sympathizing",
         | 
| 1545 | 
            +
              "synchronisation": "synchronization",
         | 
| 1546 | 
            +
              "synchronise": "synchronize",
         | 
| 1547 | 
            +
              "synchronised": "synchronized",
         | 
| 1548 | 
            +
              "synchronises": "synchronizes",
         | 
| 1549 | 
            +
              "synchronising": "synchronizing",
         | 
| 1550 | 
            +
              "synthesise": "synthesize",
         | 
| 1551 | 
            +
              "synthesised": "synthesized",
         | 
| 1552 | 
            +
              "synthesiser": "synthesizer",
         | 
| 1553 | 
            +
              "synthesisers": "synthesizers",
         | 
| 1554 | 
            +
              "synthesises": "synthesizes",
         | 
| 1555 | 
            +
              "synthesising": "synthesizing",
         | 
| 1556 | 
            +
              "syphon": "siphon",
         | 
| 1557 | 
            +
              "syphoned": "siphoned",
         | 
| 1558 | 
            +
              "syphoning": "siphoning",
         | 
| 1559 | 
            +
              "syphons": "siphons",
         | 
| 1560 | 
            +
              "systematisation": "systematization",
         | 
| 1561 | 
            +
              "systematise": "systematize",
         | 
| 1562 | 
            +
              "systematised": "systematized",
         | 
| 1563 | 
            +
              "systematises": "systematizes",
         | 
| 1564 | 
            +
              "systematising": "systematizing",
         | 
| 1565 | 
            +
              "tantalise": "tantalize",
         | 
| 1566 | 
            +
              "tantalised": "tantalized",
         | 
| 1567 | 
            +
              "tantalises": "tantalizes",
         | 
| 1568 | 
            +
              "tantalising": "tantalizing",
         | 
| 1569 | 
            +
              "tantalisingly": "tantalizingly",
         | 
| 1570 | 
            +
              "tasselled": "tasseled",
         | 
| 1571 | 
            +
              "technicolour": "technicolor",
         | 
| 1572 | 
            +
              "temporise": "temporize",
         | 
| 1573 | 
            +
              "temporised": "temporized",
         | 
| 1574 | 
            +
              "temporises": "temporizes",
         | 
| 1575 | 
            +
              "temporising": "temporizing",
         | 
| 1576 | 
            +
              "tenderise": "tenderize",
         | 
| 1577 | 
            +
              "tenderised": "tenderized",
         | 
| 1578 | 
            +
              "tenderises": "tenderizes",
         | 
| 1579 | 
            +
              "tenderising": "tenderizing",
         | 
| 1580 | 
            +
              "terrorise": "terrorize",
         | 
| 1581 | 
            +
              "terrorised": "terrorized",
         | 
| 1582 | 
            +
              "terrorises": "terrorizes",
         | 
| 1583 | 
            +
              "terrorising": "terrorizing",
         | 
| 1584 | 
            +
              "theatre": "theater",
         | 
| 1585 | 
            +
              "theatregoer": "theatergoer",
         | 
| 1586 | 
            +
              "theatregoers": "theatergoers",
         | 
| 1587 | 
            +
              "theatres": "theaters",
         | 
| 1588 | 
            +
              "theorise": "theorize",
         | 
| 1589 | 
            +
              "theorised": "theorized",
         | 
| 1590 | 
            +
              "theorises": "theorizes",
         | 
| 1591 | 
            +
              "theorising": "theorizing",
         | 
| 1592 | 
            +
              "tonne": "ton",
         | 
| 1593 | 
            +
              "tonnes": "tons",
         | 
| 1594 | 
            +
              "towelled": "toweled",
         | 
| 1595 | 
            +
              "towelling": "toweling",
         | 
| 1596 | 
            +
              "toxaemia": "toxemia",
         | 
| 1597 | 
            +
              "tranquillise": "tranquilize",
         | 
| 1598 | 
            +
              "tranquillised": "tranquilized",
         | 
| 1599 | 
            +
              "tranquilliser": "tranquilizer",
         | 
| 1600 | 
            +
              "tranquillisers": "tranquilizers",
         | 
| 1601 | 
            +
              "tranquillises": "tranquilizes",
         | 
| 1602 | 
            +
              "tranquillising": "tranquilizing",
         | 
| 1603 | 
            +
              "tranquillity": "tranquility",
         | 
| 1604 | 
            +
              "tranquillize": "tranquilize",
         | 
| 1605 | 
            +
              "tranquillized": "tranquilized",
         | 
| 1606 | 
            +
              "tranquillizer": "tranquilizer",
         | 
| 1607 | 
            +
              "tranquillizers": "tranquilizers",
         | 
| 1608 | 
            +
              "tranquillizes": "tranquilizes",
         | 
| 1609 | 
            +
              "tranquillizing": "tranquilizing",
         | 
| 1610 | 
            +
              "tranquilly": "tranquility",
         | 
| 1611 | 
            +
              "transistorised": "transistorized",
         | 
| 1612 | 
            +
              "traumatise": "traumatize",
         | 
| 1613 | 
            +
              "traumatised": "traumatized",
         | 
| 1614 | 
            +
              "traumatises": "traumatizes",
         | 
| 1615 | 
            +
              "traumatising": "traumatizing",
         | 
| 1616 | 
            +
              "travelled": "traveled",
         | 
| 1617 | 
            +
              "traveller": "traveler",
         | 
| 1618 | 
            +
              "travellers": "travelers",
         | 
| 1619 | 
            +
              "travelling": "traveling",
         | 
| 1620 | 
            +
              "travelog": "travelogue",
         | 
| 1621 | 
            +
              "travelogs": "travelogues",
         | 
| 1622 | 
            +
              "trialled": "trialed",
         | 
| 1623 | 
            +
              "trialling": "trialing",
         | 
| 1624 | 
            +
              "tricolour": "tricolor",
         | 
| 1625 | 
            +
              "tricolours": "tricolors",
         | 
| 1626 | 
            +
              "trivialise": "trivialize",
         | 
| 1627 | 
            +
              "trivialised": "trivialized",
         | 
| 1628 | 
            +
              "trivialises": "trivializes",
         | 
| 1629 | 
            +
              "trivialising": "trivializing",
         | 
| 1630 | 
            +
              "tumour": "tumor",
         | 
| 1631 | 
            +
              "tumours": "tumors",
         | 
| 1632 | 
            +
              "tunnelled": "tunneled",
         | 
| 1633 | 
            +
              "tunnelling": "tunneling",
         | 
| 1634 | 
            +
              "tyrannise": "tyrannize",
         | 
| 1635 | 
            +
              "tyrannised": "tyrannized",
         | 
| 1636 | 
            +
              "tyrannises": "tyrannizes",
         | 
| 1637 | 
            +
              "tyrannising": "tyrannizing",
         | 
| 1638 | 
            +
              "tyre": "tire",
         | 
| 1639 | 
            +
              "tyres": "tires",
         | 
| 1640 | 
            +
              "unauthorised": "unauthorized",
         | 
| 1641 | 
            +
              "uncivilised": "uncivilized",
         | 
| 1642 | 
            +
              "underutilised": "underutilized",
         | 
| 1643 | 
            +
              "unequalled": "unequaled",
         | 
| 1644 | 
            +
              "unfavourable": "unfavorable",
         | 
| 1645 | 
            +
              "unfavourably": "unfavorably",
         | 
| 1646 | 
            +
              "unionisation": "unionization",
         | 
| 1647 | 
            +
              "unionise": "unionize",
         | 
| 1648 | 
            +
              "unionised": "unionized",
         | 
| 1649 | 
            +
              "unionises": "unionizes",
         | 
| 1650 | 
            +
              "unionising": "unionizing",
         | 
| 1651 | 
            +
              "unorganised": "unorganized",
         | 
| 1652 | 
            +
              "unravelled": "unraveled",
         | 
| 1653 | 
            +
              "unravelling": "unraveling",
         | 
| 1654 | 
            +
              "unrecognisable": "unrecognizable",
         | 
| 1655 | 
            +
              "unrecognised": "unrecognized",
         | 
| 1656 | 
            +
              "unrivalled": "unrivaled",
         | 
| 1657 | 
            +
              "unsavoury": "unsavory",
         | 
| 1658 | 
            +
              "untrammelled": "untrammeled",
         | 
| 1659 | 
            +
              "urbanisation": "urbanization",
         | 
| 1660 | 
            +
              "urbanise": "urbanize",
         | 
| 1661 | 
            +
              "urbanised": "urbanized",
         | 
| 1662 | 
            +
              "urbanises": "urbanizes",
         | 
| 1663 | 
            +
              "urbanising": "urbanizing",
         | 
| 1664 | 
            +
              "utilisable": "utilizable",
         | 
| 1665 | 
            +
              "utilisation": "utilization",
         | 
| 1666 | 
            +
              "utilise": "utilize",
         | 
| 1667 | 
            +
              "utilised": "utilized",
         | 
| 1668 | 
            +
              "utilises": "utilizes",
         | 
| 1669 | 
            +
              "utilising": "utilizing",
         | 
| 1670 | 
            +
              "valour": "valor",
         | 
| 1671 | 
            +
              "vandalise": "vandalize",
         | 
| 1672 | 
            +
              "vandalised": "vandalized",
         | 
| 1673 | 
            +
              "vandalises": "vandalizes",
         | 
| 1674 | 
            +
              "vandalising": "vandalizing",
         | 
| 1675 | 
            +
              "vaporisation": "vaporization",
         | 
| 1676 | 
            +
              "vaporise": "vaporize",
         | 
| 1677 | 
            +
              "vaporised": "vaporized",
         | 
| 1678 | 
+  "vaporises": "vaporizes",
+  "vaporising": "vaporizing",
+  "vapour": "vapor",
+  "vapours": "vapors",
+  "verbalise": "verbalize",
+  "verbalised": "verbalized",
+  "verbalises": "verbalizes",
+  "verbalising": "verbalizing",
+  "victimisation": "victimization",
+  "victimise": "victimize",
+  "victimised": "victimized",
+  "victimises": "victimizes",
+  "victimising": "victimizing",
+  "videodisc": "videodisk",
+  "videodiscs": "videodisks",
+  "vigour": "vigor",
+  "visualisation": "visualization",
+  "visualisations": "visualizations",
+  "visualise": "visualize",
+  "visualised": "visualized",
+  "visualises": "visualizes",
+  "visualising": "visualizing",
+  "vocalisation": "vocalization",
+  "vocalisations": "vocalizations",
+  "vocalise": "vocalize",
+  "vocalised": "vocalized",
+  "vocalises": "vocalizes",
+  "vocalising": "vocalizing",
+  "vulcanised": "vulcanized",
+  "vulgarisation": "vulgarization",
+  "vulgarise": "vulgarize",
+  "vulgarised": "vulgarized",
+  "vulgarises": "vulgarizes",
+  "vulgarising": "vulgarizing",
+  "waggon": "wagon",
+  "waggons": "wagons",
+  "watercolour": "watercolor",
+  "watercolours": "watercolors",
+  "weaselled": "weaseled",
+  "weaselling": "weaseling",
+  "westernisation": "westernization",
+  "westernise": "westernize",
+  "westernised": "westernized",
+  "westernises": "westernizes",
+  "westernising": "westernizing",
+  "womanise": "womanize",
+  "womanised": "womanized",
+  "womaniser": "womanizer",
+  "womanisers": "womanizers",
+  "womanises": "womanizes",
+  "womanising": "womanizing",
+  "woollen": "woolen",
+  "woollens": "woolens",
+  "woollies": "woolies",
+  "woolly": "wooly",
+  "worshipped": "worshiped",
+  "worshipper": "worshiper",
+  "worshipping": "worshiping",
+  "yodelled": "yodeled",
+  "yodelling": "yodeling",
+  "yoghourt": "yogurt",
+  "yoghourts": "yogurts",
+  "yoghurt": "yogurt",
+  "yoghurts": "yogurts"
+}
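The entries above are the tail of a British-to-American spelling table used when normalizing English text for WER scoring. A minimal sketch of how such a table can be applied word by word (an illustration, not the library's own normalizer):

import json
import re

# load the spelling table committed above
with open("normalizer.json") as f:
    spelling = json.load(f)  # e.g. {"vapour": "vapor", "yoghurt": "yogurt", ...}

def americanize(text: str) -> str:
    # replace whole lowercase words found in the table; leave everything else untouched
    return re.sub(r"[a-z]+", lambda m: spelling.get(m.group(0), m.group(0)), text.lower())

print(americanize("The vapour had a yellowish colour"))  # -> "the vapor had a yellowish color"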
    	
preprocess_dataset.py
ADDED

@@ -0,0 +1,89 @@
+from datasets import load_dataset, DatasetDict
+from transformers import WhisperFeatureExtractor
+from transformers import WhisperTokenizer
+from transformers import WhisperProcessor
+from datasets import Audio
+from transformers.models.whisper.english_normalizer import BasicTextNormalizer
+from huggingface_hub import login
+
+import argparse
+
+my_parser = argparse.ArgumentParser()
+
+my_parser.add_argument(
+    "--model_name",
+    "-model_name",
+    type=str,
+    action="store",
+    default="openai/whisper-tiny",
+)
+my_parser.add_argument("--hf_token", "-hf_token", type=str, action="store")
+my_parser.add_argument(
+    "--dataset_name", "-dataset_name", type=str, action="store", default="google/fleurs"
+)
+my_parser.add_argument("--split", "-split", type=str, action="store", default="test")
+my_parser.add_argument("--subset", "-subset", type=str, action="store")
+
+args = my_parser.parse_args()
+
+dataset_name = args.dataset_name
+model_name = args.model_name
+subset = args.subset
+hf_token = args.hf_token
+login(hf_token)
+text_column = "sentence"
+if dataset_name == "google/fleurs":
+    text_column = "transcription"
+
+do_lower_case = False
+do_remove_punctuation = False
+
+normalizer = BasicTextNormalizer()
+processor = WhisperProcessor.from_pretrained(
+    model_name, language="Arabic", task="transcribe"
+)
+dataset = load_dataset(dataset_name, subset, use_auth_token=True)
+
+print(dataset)
+
+feature_extractor = WhisperFeatureExtractor.from_pretrained(model_name)
+
+tokenizer = WhisperTokenizer.from_pretrained(
+    model_name, language="Arabic", task="transcribe"
+)
+dataset = dataset.cast_column("audio", Audio(sampling_rate=16000))
+
+
+def prepare_dataset(batch):
+    # load and (possibly) resample audio data to 16kHz
+    audio = batch["audio"]
+
+    # compute log-Mel input features from input audio array
+    batch["input_features"] = processor.feature_extractor(
+        audio["array"], sampling_rate=audio["sampling_rate"]
+    ).input_features[0]
+    # compute input length of audio sample in seconds
+    batch["input_length"] = len(audio["array"]) / audio["sampling_rate"]
+
+    # optional pre-processing steps
+    transcription = batch[text_column]
+    if do_lower_case:
+        transcription = transcription.lower()
+    if do_remove_punctuation:
+        transcription = normalizer(transcription).strip()
+
+    # encode target text to label ids
+    batch["labels"] = processor.tokenizer(transcription).input_ids
+    return batch
+
+
+dataset = dataset.map(prepare_dataset, remove_columns=dataset.column_names["train"])
+
+login(hf_token)
+print(
+    f"pushing to arbml/{dataset_name.split('/')[-1]}_preprocessed_{model_name.split('/')[-1]}"
+)
+dataset.push_to_hub(
+    f"arbml/{dataset_name.split('/')[-1]}_preprocessed_{model_name.split('/')[-1]}",
+    private=True,
+)
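The script above pushes the processed splits to a private Hub repo whose name it prints. A hypothetical follow-up, assuming a run with the default arguments succeeded (the repo id below is illustrative, following the pattern the script prints):

from datasets import load_dataset

# repo id pattern: arbml/<dataset>_preprocessed_<model>
processed = load_dataset("arbml/fleurs_preprocessed_whisper-tiny", use_auth_token=True)
print(processed)  # each split now carries input_features, input_length and labels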
    	
preprocessor_config.json
ADDED

The diff for this file is too large to render. See raw diff.
    	
pytorch_model.bin
ADDED

@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:fe6c3c8428592f02092883c1cc0b29af6a816cb82a70d648ac0e5224b13512e7
+size 3055754841
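This entry is a Git LFS pointer, not the weights themselves: the oid is the SHA-256 of the real file and size is its length in bytes (about 3.1 GB here). A small sketch for checking a downloaded checkpoint against the pointer:

import hashlib

sha256 = hashlib.sha256()
with open("pytorch_model.bin", "rb") as f:
    for chunk in iter(lambda: f.read(1 << 20), b""):  # hash in 1 MiB chunks
        sha256.update(chunk)
print(sha256.hexdigest() == "fe6c3c8428592f02092883c1cc0b29af6a816cb82a70d648ac0e5224b13512e7")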
    	
run.sh
ADDED

@@ -0,0 +1,38 @@
+python run_speech_recognition_seq2seq_streaming.py \
+--model_name_or_path="openai/whisper-small" \
+--dataset_name="mozilla-foundation/common_voice_11_0" \
+--dataset_config_name="ar" \
+--language="Arabic" \
+--train_split_name="train+validation" \
+--eval_split_name="test" \
+--model_index_name="Whisper Small Arabic" \
+--max_steps="5000" \
+--output_dir="./" \
+--per_device_train_batch_size="64" \
+--per_device_eval_batch_size="32" \
+--logging_steps="25" \
+--learning_rate="1e-5" \
+--warmup_steps="500" \
+--evaluation_strategy="steps" \
+--eval_steps="1000" \
+--save_strategy="steps" \
+--save_steps="1000" \
+--generation_max_length="225" \
+--length_column_name="input_length" \
+--max_duration_in_seconds="30" \
+--text_column_name="sentence" \
+--freeze_feature_encoder="False" \
+--report_to="tensorboard" \
+--metric_for_best_model="wer" \
+--greater_is_better="False" \
+--load_best_model_at_end \
+--gradient_checkpointing \
+--fp16 \
+--overwrite_output_dir \
+--optim="adamw_bnb_8bit" \
+--do_train \
+--do_eval \
+--predict_with_generate \
+--do_normalize_eval \
+--use_auth_token \
+--push_to_hub
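Every --flag in this launcher maps onto a dataclass field that HfArgumentParser fills in; note also that --optim="adamw_bnb_8bit" selects bitsandbytes' 8-bit AdamW and therefore needs the bitsandbytes package installed. A minimal sketch of the parsing mechanism with a few of the training-only flags above:

from transformers import HfArgumentParser, Seq2SeqTrainingArguments

parser = HfArgumentParser(Seq2SeqTrainingArguments)
(training_args,) = parser.parse_args_into_dataclasses(
    args=[
        "--output_dir", "./",
        "--max_steps", "5000",
        "--per_device_train_batch_size", "64",
        "--learning_rate", "1e-5",
        "--warmup_steps", "500",
        "--evaluation_strategy", "steps",
        "--eval_steps", "1000",
    ]
)
print(training_args.max_steps, training_args.learning_rate)  # 5000 1e-05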
    	
run_eval_whisper_streaming.py
ADDED

@@ -0,0 +1,166 @@
+import argparse
+import pyarabic.araby as araby
+from transformers import pipeline
+from transformers.models.whisper.english_normalizer import BasicTextNormalizer
+from datasets import load_dataset, Audio
+import evaluate
+
+wer_metric = evaluate.load("wer")
+
+
+def is_target_text_in_range(ref):
+    if ref.strip() == "ignore time segment in scoring":
+        return False
+    else:
+        return ref.strip() != ""
+
+
+def get_text(sample):
+    if "text" in sample:
+        return sample["text"]
+    elif "sentence" in sample:
+        return sample["sentence"]
+    elif "normalized_text" in sample:
+        return sample["normalized_text"]
+    elif "transcript" in sample:
+        return sample["transcript"]
+    elif "transcription" in sample:
+        return sample["transcription"]
+    else:
+        raise ValueError(
+            "Expected transcript column of either 'text', 'sentence', 'normalized_text', "
+            f"'transcript' or 'transcription'. Got sample with keys {', '.join(sample.keys())}. "
+            "Ensure a text column name is present in the dataset."
+        )
+
+
+whisper_norm = BasicTextNormalizer()
+
+
+def normalise(batch):
+    batch["norm_text"] = whisper_norm(get_text(batch))
+    return batch
+
+
+def remove_diacritics(batch):
+    batch["norm_text"] = araby.strip_diacritics(get_text(batch))
+    return batch
+
+
+def data(dataset):
+    for item in dataset:
+        yield {**item["audio"], "reference": item["norm_text"]}
+
+
+def main(args):
+    batch_size = args.batch_size
+    whisper_asr = pipeline(
+        "automatic-speech-recognition", model=args.model_id, device=args.device
+    )
+
+    whisper_asr.model.config.forced_decoder_ids = (
+        whisper_asr.tokenizer.get_decoder_prompt_ids(
+            language=args.language, task="transcribe"
+        )
+    )
+
+    dataset = load_dataset(
+        args.dataset,
+        args.config,
+        split=args.split,
+        streaming=args.streaming,
+        use_auth_token=True,
+    )
+
+    # only subsample when --max_eval_samples is set (useful for debugging)
+    if args.max_eval_samples is not None:
+        dataset = dataset.take(args.max_eval_samples)
+
+    dataset = dataset.cast_column("audio", Audio(sampling_rate=16000))
+    dataset = dataset.map(normalise)
+    if args.remove_diacritics:
+        print("stripping diacritics")
+        dataset = dataset.map(remove_diacritics)
+    dataset = dataset.filter(is_target_text_in_range, input_columns=["norm_text"])
+
+    predictions = []
+    references = []
+
+    # run streamed inference
+    for out in whisper_asr(data(dataset), batch_size=batch_size):
+        predictions.append(whisper_norm(out["text"]))
+        references.append(out["reference"][0])
+
+    wer = wer_metric.compute(references=references, predictions=predictions)
+    wer = round(100 * wer, 2)
+
+    print("WER:", wer)
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+
+    parser.add_argument(
+        "--model_id",
+        type=str,
+        required=True,
+        help="Model identifier. Should be loadable with 🤗 Transformers",
+    )
+    parser.add_argument(
+        "--dataset",
+        type=str,
+        default="mozilla-foundation/common_voice_11_0",
+        help="Dataset name to evaluate the `model_id`. Should be loadable with 🤗 Datasets",
+    )
+    parser.add_argument(
+        "--config",
+        type=str,
+        required=True,
+        help="Config of the dataset. *E.g.* `'en'` for the English split of Common Voice",
+    )
+    parser.add_argument(
+        "--split",
+        type=str,
+        default="test",
+        help="Split of the dataset. *E.g.* `'test'`",
+    )
+
+    parser.add_argument(
+        "--device",
+        type=int,
+        default=-1,
+        help="The device to run the pipeline on. -1 for CPU (default), 0 for the first GPU and so on.",
+    )
+    parser.add_argument(
+        "--batch_size",
+        type=int,
+        default=16,
+        help="Number of samples to go through each streamed batch.",
+    )
+    parser.add_argument(
+        "--max_eval_samples",
+        type=int,
+        default=None,
+        help="Number of samples to be evaluated. Put a lower number e.g. 64 for testing this script.",
+    )
+    # note: argparse's type=bool treats any non-empty string as True
+    parser.add_argument(
+        "--streaming",
+        type=bool,
+        default=True,
+        help="Choose whether you'd like to download the entire dataset or stream it during the evaluation.",
+    )
+    parser.add_argument(
+        "--language",
+        type=str,
+        required=True,
+        help="Two letter language code for the transcription language, e.g. use 'en' for English.",
+    )
+
+    parser.add_argument(
+        "--remove_diacritics",
+        type=bool,
+        default=False,
+        help="Whether to strip Arabic diacritics from the references before scoring.",
+    )
+
+    args = parser.parse_args()
+
+    main(args)
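The metric call at the heart of this script is easy to sanity-check in isolation: WER counts substitutions, insertions and deletions against the number of reference words. A toy example:

import evaluate

wer_metric = evaluate.load("wer")
wer = wer_metric.compute(
    references=["the cat sat on the mat"],
    predictions=["the cat sat on mat"],  # one deletion out of six reference words
)
print(round(100 * wer, 2))  # 16.67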
    	
run_mgb2.sh
ADDED

@@ -0,0 +1,37 @@
+python run_speech_recognition_seq2seq_mixed_mgb2.py \
+--model_name_or_path="openai/whisper-medium" \
+--dataset_name="arbml/mgb2_speech" \
+--dataset_config_name="ar" \
+--language="Arabic" \
+--train_split_name="train" \
+--eval_split_name="test" \
+--model_index_name="Whisper Medium Arabic" \
+--max_steps="20000" \
+--output_dir="./" \
+--per_device_train_batch_size="32" \
+--per_device_eval_batch_size="16" \
+--logging_steps="25" \
+--learning_rate="1e-5" \
+--warmup_steps="500" \
+--evaluation_strategy="steps" \
+--eval_steps="1000" \
+--save_strategy="steps" \
+--save_steps="1000" \
+--generation_max_length="225" \
+--length_column_name="input_length" \
+--max_duration_in_seconds="30" \
+--text_column_name="text" \
+--freeze_feature_encoder="False" \
+--report_to="tensorboard" \
+--metric_for_best_model="wer" \
+--greater_is_better="False" \
+--load_best_model_at_end \
+--gradient_checkpointing \
+--fp16 \
+--overwrite_output_dir \
+--do_train \
+--do_eval \
+--predict_with_generate \
+--do_normalize_eval \
+--use_auth_token \
+--push_to_hub
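Once this run pushes its best checkpoint, a quick smoke test is to load it back through the same pipeline the evaluation script uses. The repo id below is a placeholder for whatever this run actually pushes:

from transformers import pipeline

# "your-user/whisper-medium-mgb2" is hypothetical; substitute the pushed repo id
asr = pipeline("automatic-speech-recognition", model="your-user/whisper-medium-mgb2")
print(asr("sample.wav", chunk_length_s=30)["text"])  # sample.wav: any test clip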
    	
run_speech_recognition_seq2seq.py
ADDED

@@ -0,0 +1,607 @@
+#!/usr/bin/env python
+# coding=utf-8
+# Copyright 2022 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+Fine-tuning the library models for sequence to sequence speech recognition.
+"""
+# You can also adapt this script for your own sequence to sequence speech
+# recognition task. Pointers for this are left as comments.
+
+import logging
+import os
+import sys
+from dataclasses import dataclass, field
+from typing import Any, Dict, List, Optional, Union
+
+import datasets
+import torch
+from datasets import IterableDatasetDict, interleave_datasets, load_dataset
+from torch.utils.data import IterableDataset
+
+import evaluate
+import transformers
+from transformers import (
+    AutoConfig,
+    AutoFeatureExtractor,
+    AutoModelForSpeechSeq2Seq,
+    AutoProcessor,
+    AutoTokenizer,
+    HfArgumentParser,
+    Seq2SeqTrainer,
+    Seq2SeqTrainingArguments,
+    TrainerCallback,
+    set_seed,
+)
+from transformers.trainer_pt_utils import IterableDatasetShard
+from transformers.trainer_utils import get_last_checkpoint, is_main_process
+from transformers.utils import check_min_version, send_example_telemetry
+from transformers.utils.versions import require_version
+from transformers.models.whisper.english_normalizer import BasicTextNormalizer
+
+os.environ['LD_LIBRARY_PATH'] = '/usr/lib/x86_64-linux-gnu'
+# Will error if the minimal version of Transformers is not installed. Remove at your own risk.
+check_min_version("4.25.0.dev0")
+
+require_version("datasets>=1.18.2", "To fix: pip install -r examples/pytorch/speech-recognition/requirements.txt")
+
+logger = logging.getLogger(__name__)
+
+
+@dataclass
+class ModelArguments:
+    """
+    Arguments pertaining to which model/config/tokenizer we are going to fine-tune from.
+    """
+
+    model_name_or_path: str = field(
+        metadata={"help": "Path to pretrained model or model identifier from huggingface.co/models"}
+    )
+    config_name: Optional[str] = field(
+        default=None, metadata={"help": "Pretrained config name or path if not the same as model_name"}
+    )
+    tokenizer_name: Optional[str] = field(
+        default=None, metadata={"help": "Pretrained tokenizer name or path if not the same as model_name"}
+    )
+    feature_extractor_name: Optional[str] = field(
+        default=None, metadata={"help": "Feature extractor name or path if not the same as model_name"}
+    )
+    cache_dir: Optional[str] = field(
+        default=None,
+        metadata={"help": "Where to store the pretrained models downloaded from huggingface.co"},
+    )
+    use_fast_tokenizer: bool = field(
+        default=True,
+        metadata={"help": "Whether to use one of the fast tokenizers (backed by the tokenizers library) or not."},
+    )
+    model_revision: str = field(
+        default="main",
+        metadata={"help": "The specific model version to use (can be a branch name, tag name or commit id)."},
+    )
+    use_auth_token: bool = field(
+        default=False,
+        metadata={
+            "help": (
+                "Will use the token generated when running `huggingface-cli login` (necessary to use this script "
+                "with private models)."
+            )
+        },
+    )
+    freeze_feature_encoder: bool = field(
+        default=True, metadata={"help": "Whether to freeze the feature encoder layers of the model."}
+    )
+    freeze_encoder: bool = field(
+        default=False, metadata={"help": "Whether to freeze the entire encoder of the seq2seq model."}
+    )
+    forced_decoder_ids: List[List[int]] = field(
+        default=None,
+        metadata={
+            "help": (
+                "A list of pairs of integers which indicates a mapping from generation indices to token indices "
+                "that will be forced before sampling. For example, [[0, 123]] means the first generated token "
+                "will always be a token of index 123."
+            )
+        },
+    )
+    suppress_tokens: List[int] = field(
+        default=None, metadata={"help": "A list of tokens that will be suppressed at generation."}
+    )
+    model_index_name: str = field(default=None, metadata={"help": "Pretty name for the model card."})
+
+
+@dataclass
+class DataTrainingArguments:
+    """
+    Arguments pertaining to what data we are going to input our model for training and eval.
+    """
+
+    dataset_name: str = field(
+        default=None, metadata={"help": "The name of the dataset to use (via the datasets library)."}
+    )
+    dataset_config_name: Optional[str] = field(
+        default=None, metadata={"help": "The configuration name of the dataset to use (via the datasets library)."}
+    )
+    text_column: Optional[str] = field(
+        default=None,
+        metadata={"help": "The name of the column in the datasets containing the full texts (for summarization)."},
+    )
+    max_train_samples: Optional[int] = field(
+        default=None,
+        metadata={
+            "help": (
+                "For debugging purposes or quicker training, truncate the number of training examples to this "
+                "value if set."
+            )
+        },
+    )
+    max_eval_samples: Optional[int] = field(
+        default=None,
+        metadata={
+            "help": (
+                "For debugging purposes or quicker training, truncate the number of evaluation examples to this "
+                "value if set."
+            )
+        },
+    )
+    audio_column_name: str = field(
+        default="audio",
+        metadata={"help": "The name of the dataset column containing the audio data. Defaults to 'audio'"},
+    )
+    text_column_name: str = field(
+        default="text",
+        metadata={"help": "The name of the dataset column containing the text data. Defaults to 'text'"},
+    )
+    max_duration_in_seconds: float = field(
+        default=20.0,
+        metadata={
+            "help": (
+                "Truncate audio files that are longer than `max_duration_in_seconds` seconds to"
+                " `max_duration_in_seconds`"
+            )
+        },
+    )
+    min_duration_in_seconds: float = field(
+        default=0.0, metadata={"help": "Filter audio files that are shorter than `min_duration_in_seconds` seconds"}
+    )
+    train_split_name: str = field(
+        default="train",
+        metadata={
+            "help": "The name of the training data set split to use (via the datasets library). Defaults to 'train'"
+        },
+    )
+    eval_split_name: str = field(
+        default="test",
+        metadata={
+            "help": "The name of the evaluation data set split to use (via the datasets library). Defaults to 'test'"
+        },
+    )
+    do_lower_case: bool = field(
+        default=False,
+        metadata={"help": "Whether the target text should be lower cased."},
+    )
+    do_remove_punctuation: bool = field(
+        default=False,
+        metadata={"help": "Whether the target text should be stripped of punctuation."},
+    )
+    do_normalize_eval: bool = field(
+        default=True,
+        metadata={"help": "Whether to normalise the references and predictions in the eval WER calculation."},
+    )
+    language: str = field(
+        default=None,
+        metadata={
+            "help": (
+                "Language for multilingual fine-tuning. This argument should be set for multilingual fine-tuning "
+                "only. For English speech recognition, it should be set to `None`."
+            )
+        },
+    )
+    task: str = field(
+        default="transcribe",
+        metadata={"help": "Task, either `transcribe` for speech recognition or `translate` for speech translation."},
+    )
+    shuffle_buffer_size: Optional[int] = field(
+        default=500,
+        metadata={
+            "help": (
+                "The number of streamed examples to download before shuffling them. The larger the buffer, "
+                "the closer it is to real offline shuffling."
+            )
+        },
+    )
+
+
+@dataclass
+class DataCollatorSpeechSeq2SeqWithPadding:
+    """
+    Data collator that will dynamically pad the inputs received.
+    Args:
+        processor ([`WhisperProcessor`])
+            The processor used for processing the data.
+        decoder_start_token_id (`int`)
+            The begin-of-sentence token id of the decoder.
+    """
+
+    processor: Any
+    decoder_start_token_id: int
+
+    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
+        # split inputs and labels since they have to be of different lengths and need
+        # different padding methods
+        model_input_name = self.processor.model_input_names[0]
+        input_features = [{model_input_name: feature[model_input_name]} for feature in features]
+        label_features = [{"input_ids": feature["labels"]} for feature in features]
+
+        batch = self.processor.feature_extractor.pad(input_features, return_tensors="pt")
+
+        labels_batch = self.processor.tokenizer.pad(label_features, return_tensors="pt")
+
+        # replace padding with -100 to ignore loss correctly
+        labels = labels_batch["input_ids"].masked_fill(labels_batch.attention_mask.ne(1), -100)
+
+        # if a bos token was prepended in the previous tokenization step,
+        # cut it here since it is appended again later anyway
+        if (labels[:, 0] == self.decoder_start_token_id).all().cpu().item():
+            labels = labels[:, 1:]
+
+        batch["labels"] = labels
+
+        return batch
+
             | 
| 262 | 
            +
             | 
| 263 | 
            +
            def load_datasets(dataset_name, dataset_config_name, split="train", **kwargs):
         | 
| 264 | 
            +
                """
         | 
| 265 | 
            +
                Utility function to load a dataset in streaming mode. For datasets with multiple splits,
         | 
| 266 | 
            +
                each split is loaded individually and then splits combined by taking alternating examples from
         | 
| 267 | 
            +
                each (interleaving).
         | 
| 268 | 
            +
                """
         | 
| 269 | 
            +
                if "+" in split:
         | 
| 270 | 
            +
                    # load multiple splits separated by the `+` symbol with streaming mode
         | 
| 271 | 
            +
                    dataset_splits = [
         | 
| 272 | 
            +
                        load_dataset(dataset_name, dataset_config_name, split=split_name, **kwargs)
         | 
| 273 | 
            +
                        for split_name in split.split("+")
         | 
| 274 | 
            +
                    ]
         | 
| 275 | 
            +
                    # interleave multiple splits to form one dataset
         | 
| 276 | 
            +
                    interleaved_dataset = interleave_datasets(dataset_splits)
         | 
| 277 | 
            +
                    return interleaved_dataset
         | 
| 278 | 
            +
                else:
         | 
| 279 | 
            +
                    # load a single split *with* streaming mode
         | 
| 280 | 
            +
                    dataset = load_dataset(dataset_name, dataset_config_name, split=split, **kwargs)
         | 
| 281 | 
            +
                    return dataset
         | 
| 282 | 
            +
             | 
| 283 | 
            +
             | 
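# A self-contained sketch of the alternating behaviour of interleave_datasets,
# using toy in-memory datasets in place of the streamed splits handled above.
from datasets import Dataset, interleave_datasets

train = Dataset.from_dict({"text": ["a", "b", "c"]})
validation = Dataset.from_dict({"text": ["x", "y", "z"]})
mixed = interleave_datasets([train, validation])
print(mixed["text"])  # ['a', 'x', 'b', 'y', 'c', 'z']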
| 284 | 
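# Illustrative usage (not in the original script; the dataset id is an example only):
# combining train and validation splits with the `+` syntax:
#   >>> ds = load_datasets(
#   ...     "mozilla-foundation/common_voice_11_0", "ar",
#   ...     split="train+validation", use_auth_token=True,
#   ... )
# The returned dataset yields examples from the two splits alternately.
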
def main():
    # 1. Parse input arguments
    # See all possible arguments in src/transformers/training_args.py
    # or by passing the --help flag to this script.
    # We now keep distinct sets of args, for a cleaner separation of concerns.
    parser = HfArgumentParser((ModelArguments, DataTrainingArguments, Seq2SeqTrainingArguments))

    if len(sys.argv) == 2 and sys.argv[1].endswith(".json"):
        # If we pass only one argument to the script and it's the path to a json file,
        # let's parse it to get our arguments.
        model_args, data_args, training_args = parser.parse_json_file(json_file=os.path.abspath(sys.argv[1]))
    else:
        model_args, data_args, training_args = parser.parse_args_into_dataclasses()

    # Sending telemetry. Tracking the example usage helps us better allocate resources to maintain them. The
    # information sent is the one passed as arguments along with your Python/PyTorch versions.
    send_example_telemetry("run_speech_recognition_seq2seq", model_args, data_args)

    # 2. Setup logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        handlers=[logging.StreamHandler(sys.stdout)],
    )
    log_level = training_args.get_process_log_level()
    logger.setLevel(log_level)
    datasets.utils.logging.set_verbosity(log_level)
    transformers.utils.logging.set_verbosity(log_level)
    transformers.utils.logging.enable_default_handler()
    transformers.utils.logging.enable_explicit_format()

    logger.setLevel(logging.INFO if is_main_process(training_args.local_rank) else logging.WARN)

    # Log on each process the small summary:
    logger.warning(
        f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}, "
        f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}"
    )
    logger.info(f"Training/evaluation parameters {training_args}")

    # Set the verbosity to info of the Transformers logger (on main process only):
    if is_main_process(training_args.local_rank):
        transformers.utils.logging.set_verbosity_info()

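    # Illustrative invocations (not in the original script; paths and ids are examples):
    #   python run_speech_recognition_seq2seq_streaming.py args.json
    # or, equivalently, with explicit flags:
    #   python run_speech_recognition_seq2seq_streaming.py \
    #       --model_name_or_path openai/whisper-small \
    #       --dataset_name mozilla-foundation/common_voice_11_0 \
    #       --dataset_config_name ar --output_dir ./out --do_train --do_eval
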
    # 3. Detect the last checkpoint and resume from it if present
    last_checkpoint = None
    if os.path.isdir(training_args.output_dir) and training_args.do_train and not training_args.overwrite_output_dir:
        last_checkpoint = get_last_checkpoint(training_args.output_dir)
        if last_checkpoint is None and len(os.listdir(training_args.output_dir)) > 0:
            raise ValueError(
                f"Output directory ({training_args.output_dir}) already exists and is not empty. "
                "Use --overwrite_output_dir to overcome."
            )
        elif last_checkpoint is not None and training_args.resume_from_checkpoint is None:
            logger.info(
                f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this behavior, change "
                "the `--output_dir` or add `--overwrite_output_dir` to train from scratch."
            )

    # Set seed before initializing model.
    set_seed(training_args.seed)

    # 4. Load dataset
    raw_datasets = IterableDatasetDict()

    if training_args.do_train:
        raw_datasets["train"] = load_datasets(
            data_args.dataset_name,
            data_args.dataset_config_name,
            split=data_args.train_split_name,
            use_auth_token=True if model_args.use_auth_token else None,
        )

    if training_args.do_eval:
        raw_datasets["eval"] = load_datasets(
            data_args.dataset_name,
            data_args.dataset_config_name,
            split=data_args.eval_split_name,
            use_auth_token=True if model_args.use_auth_token else None,
        )

    raw_datasets_features = list(next(iter(raw_datasets.values())).features.keys())

    if data_args.audio_column_name not in raw_datasets_features:
        raise ValueError(
            f"--audio_column_name '{data_args.audio_column_name}' not found in dataset '{data_args.dataset_name}'. "
            "Make sure to set `--audio_column_name` to the correct audio column - one of "
            f"{', '.join(raw_datasets_features)}."
        )

    if data_args.text_column_name not in raw_datasets_features:
        raise ValueError(
            f"--text_column_name '{data_args.text_column_name}' not found in dataset '{data_args.dataset_name}'. "
            "Make sure to set `--text_column_name` to the correct text column - one of "
            f"{', '.join(raw_datasets_features)}."
        )

    # 5. Load pretrained model, tokenizer, and feature extractor
    #
    # Distributed training:
    # The .from_pretrained methods guarantee that only one local process can concurrently
    # download model & vocab.
    config = AutoConfig.from_pretrained(
        model_args.config_name if model_args.config_name else model_args.model_name_or_path,
        cache_dir=model_args.cache_dir,
        revision=model_args.model_revision,
        use_auth_token=True if model_args.use_auth_token else None,
    )

    config.update({"forced_decoder_ids": model_args.forced_decoder_ids, "suppress_tokens": model_args.suppress_tokens})

    feature_extractor = AutoFeatureExtractor.from_pretrained(
        model_args.feature_extractor_name if model_args.feature_extractor_name else model_args.model_name_or_path,
        cache_dir=model_args.cache_dir,
        revision=model_args.model_revision,
        use_auth_token=True if model_args.use_auth_token else None,
    )
    tokenizer = AutoTokenizer.from_pretrained(
        model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path,
        cache_dir=model_args.cache_dir,
        use_fast=model_args.use_fast_tokenizer,
        revision=model_args.model_revision,
        use_auth_token=True if model_args.use_auth_token else None,
    )
    model = AutoModelForSpeechSeq2Seq.from_pretrained(
        model_args.model_name_or_path,
        config=config,
        cache_dir=model_args.cache_dir,
        revision=model_args.model_revision,
        use_auth_token=True if model_args.use_auth_token else None,
    )
    model.config.use_cache = False

    if model.config.decoder_start_token_id is None:
        raise ValueError("Make sure that `config.decoder_start_token_id` is correctly defined")

    if model_args.freeze_feature_encoder:
        model.freeze_feature_encoder()

    if model_args.freeze_encoder:
        model.freeze_encoder()
        model.model.encoder.gradient_checkpointing = False

    if data_args.language is not None:
        # We only need to set the task id when the language is specified (i.e. in a multilingual setting)
        tokenizer.set_prefix_tokens(language=data_args.language, task=data_args.task)

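    # Illustrative example (not in the original script): for the Arabic MGB-2
    # fine-tuning in this repo, this amounts to something like
    #   >>> tokenizer.set_prefix_tokens(language="arabic", task="transcribe")
    # which makes the tokenizer prepend the <|ar|> and <|transcribe|> prefix tokens.
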
    # 6. Resample speech dataset if necessary
    dataset_sampling_rate = next(iter(raw_datasets.values())).features[data_args.audio_column_name].sampling_rate
    if dataset_sampling_rate != feature_extractor.sampling_rate:
        raw_datasets = raw_datasets.cast_column(
            data_args.audio_column_name, datasets.features.Audio(sampling_rate=feature_extractor.sampling_rate)
        )

    # 7. Preprocessing the datasets.
    # We need to read the audio files as arrays and tokenize the targets.
    max_input_length = data_args.max_duration_in_seconds * feature_extractor.sampling_rate
    min_input_length = data_args.min_duration_in_seconds * feature_extractor.sampling_rate
    audio_column_name = data_args.audio_column_name
    text_column_name = data_args.text_column_name
    model_input_name = feature_extractor.model_input_names[0]
    do_lower_case = data_args.do_lower_case
    do_remove_punctuation = data_args.do_remove_punctuation
    normalizer = BasicTextNormalizer()  # 'official' text normalizer from OpenAI

    if data_args.max_train_samples is not None:
        # NB: `.select` only exists on map-style datasets; a truly streamed (iterable)
        # dataset would need `.take(data_args.max_train_samples)` instead
        raw_datasets["train"] = raw_datasets["train"].select(range(data_args.max_train_samples))

    if data_args.max_eval_samples is not None:
        raw_datasets["eval"] = raw_datasets["eval"].select(range(data_args.max_eval_samples))

    def prepare_dataset(batch):
        # process audio
        sample = batch[audio_column_name]
        inputs = feature_extractor(sample["array"], sampling_rate=sample["sampling_rate"])
        # process audio length
        batch[model_input_name] = inputs.get(model_input_name)[0]
        batch["input_length"] = len(sample["array"])

        # process targets
        input_str = batch[text_column_name].lower() if do_lower_case else batch[text_column_name]
        if do_remove_punctuation:
            input_str = normalizer(input_str).strip()
        batch["labels"] = tokenizer(input_str).input_ids
        return batch

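    # Illustrative note (not in the original script): for a Whisper checkpoint,
    # `prepare_dataset` leaves each example with roughly these fields (shapes assume
    # the usual 16 kHz, 30 s log-Mel front end):
    #   input_features: float array of shape (80, 3000)  # 80 mel bins x 3000 frames
    #   input_length:   number of raw audio samples
    #   labels:         token ids of the transcription
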
    with training_args.main_process_first(desc="dataset map pre-processing"):
        vectorized_datasets = raw_datasets.map(
            prepare_dataset,
            remove_columns=raw_datasets_features,
        ).with_format("torch")

        if training_args.do_train:
            vectorized_datasets["train"] = vectorized_datasets["train"].shuffle(
                # buffer_size=data_args.shuffle_buffer_size,
                seed=training_args.seed,
            )

    # filter training data that is shorter than min_input_length or longer than
    # max_input_length
    def is_audio_in_length_range(length):
        return min_input_length < length < max_input_length

    if training_args.do_train:
        # (guarded so that eval-only runs don't fail on a missing "train" split)
        vectorized_datasets["train"] = vectorized_datasets["train"].filter(
            is_audio_in_length_range,
            input_columns=["input_length"],
        )

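    # Worked example (not in the original script): with a 16 kHz feature extractor and,
    # say, --min_duration_in_seconds 0.0 --max_duration_in_seconds 20.0, the bounds are
    #   min_input_length = 0.0  * 16000 = 0 samples
    #   max_input_length = 20.0 * 16000 = 320000 samples
    # so a clip is kept only if 0 < len(audio) < 320000.
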
    # 8. Load Metric
    metric = evaluate.load("wer")
    do_normalize_eval = data_args.do_normalize_eval

    def compute_metrics(pred):
        pred_ids = pred.predictions

        pred.label_ids[pred.label_ids == -100] = tokenizer.pad_token_id

        pred_str = tokenizer.batch_decode(pred_ids, skip_special_tokens=True)
        # we do not want to group tokens when computing the metrics
        label_str = tokenizer.batch_decode(pred.label_ids, skip_special_tokens=True)

        if do_normalize_eval:
            pred_str = [normalizer(pred) for pred in pred_str]
            label_str = [normalizer(label) for label in label_str]

        wer = 100 * metric.compute(predictions=pred_str, references=label_str)

        return {"wer": wer}

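    # Illustrative example (not in the original script) of how the WER metric behaves:
    #   >>> import evaluate
    #   >>> wer_metric = evaluate.load("wer")
    #   >>> 100 * wer_metric.compute(predictions=["the cat sat"], references=["the cat sat down"])
    #   25.0
    # (one deletion against a four-word reference = 1/4 = 25% word error rate)
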
    # 9. Create a single speech processor
    if is_main_process(training_args.local_rank):
        # save feature extractor, tokenizer and config
        feature_extractor.save_pretrained(training_args.output_dir)
        tokenizer.save_pretrained(training_args.output_dir)
        config.save_pretrained(training_args.output_dir)

    processor = AutoProcessor.from_pretrained(training_args.output_dir)

    # 10. Define data collator
    data_collator = DataCollatorSpeechSeq2SeqWithPadding(
        processor=processor,
        decoder_start_token_id=model.config.decoder_start_token_id,
    )

    # 11. Configure Trainer
    # Trainer callback to reinitialise and reshuffle the streamable datasets at the beginning of each epoch
    class ShuffleCallback(TrainerCallback):
        def on_epoch_begin(self, args, state, control, train_dataloader, **kwargs):
            if isinstance(train_dataloader.dataset, IterableDatasetShard):
                pass  # set_epoch() is handled by the Trainer
            elif isinstance(train_dataloader.dataset, IterableDataset):
                train_dataloader.dataset.set_epoch(train_dataloader.dataset._epoch + 1)

    # Initialize Trainer
    trainer = Seq2SeqTrainer(
        model=model,
        args=training_args,
        train_dataset=vectorized_datasets["train"] if training_args.do_train else None,
        eval_dataset=vectorized_datasets["eval"] if training_args.do_eval else None,
        tokenizer=feature_extractor,
        data_collator=data_collator,
        compute_metrics=compute_metrics if training_args.predict_with_generate else None,
        callbacks=[ShuffleCallback()],
    )

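    # Note (not in the original script): bumping the dataset epoch via set_epoch()
    # changes the effective shuffle seed (seed + epoch) of a shuffled streaming
    # dataset, so each epoch iterates in a different order instead of replaying epoch 0.
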
    # 12. Training
    if training_args.do_train:
        checkpoint = None
        if training_args.resume_from_checkpoint is not None:
            checkpoint = training_args.resume_from_checkpoint
        elif last_checkpoint is not None:
            checkpoint = last_checkpoint
        train_result = trainer.train(resume_from_checkpoint=checkpoint)
        trainer.save_model()  # Saves the feature extractor too for easy upload

        metrics = train_result.metrics
        if data_args.max_train_samples:
            metrics["train_samples"] = data_args.max_train_samples
        trainer.log_metrics("train", metrics)
        trainer.save_metrics("train", metrics)
        trainer.save_state()

    # 13. Evaluation
    results = {}
    if training_args.do_eval:
        logger.info("*** Evaluate ***")
        metrics = trainer.evaluate(
            metric_key_prefix="eval",
            max_length=training_args.generation_max_length,
            num_beams=training_args.generation_num_beams,
        )
        if data_args.max_eval_samples:
            metrics["eval_samples"] = data_args.max_eval_samples

        trainer.log_metrics("eval", metrics)
        trainer.save_metrics("eval", metrics)

    # 14. Write Training Stats
    kwargs = {
        "finetuned_from": model_args.model_name_or_path,
        "tasks": "automatic-speech-recognition",
        "tags": "whisper-event",
    }
    if data_args.dataset_name is not None:
        kwargs["dataset_tags"] = data_args.dataset_name
        if data_args.dataset_config_name is not None:
            kwargs["dataset"] = f"{data_args.dataset_name} {data_args.dataset_config_name}"
        else:
            kwargs["dataset"] = data_args.dataset_name
        if "common_voice" in data_args.dataset_name:
            kwargs["language"] = data_args.dataset_config_name
        if model_args.model_index_name is not None:
            kwargs["model_name"] = model_args.model_index_name

    if training_args.push_to_hub:
        trainer.push_to_hub(**kwargs)
    else:
        trainer.create_model_card(**kwargs)

    return results


if __name__ == "__main__":
    main()
    	
        run_speech_recognition_seq2seq_mixed_mgb2.py
    ADDED
    
@@ -0,0 +1,738 @@
#!/usr/bin/env python
# coding=utf-8
# Copyright 2022 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Fine-tuning the library models for sequence to sequence speech recognition
with 🤗 Datasets' streaming mode.
"""
# You can also adapt this script for your own sequence to sequence speech
# recognition task. Pointers for this are left as comments.

import logging
import os
import sys
from dataclasses import dataclass, field
from typing import Any, Dict, List, Optional, Union

import datasets
import torch
from datasets import DatasetDict, IterableDatasetDict, interleave_datasets, load_dataset
from torch.utils.data import IterableDataset

import evaluate
import transformers
from transformers import (
    AutoConfig,
    AutoFeatureExtractor,
    AutoModelForSpeechSeq2Seq,
    AutoProcessor,
    AutoTokenizer,
    HfArgumentParser,
    Seq2SeqTrainer,
    Seq2SeqTrainingArguments,
    TrainerCallback,
    set_seed,
)
from transformers.models.whisper.english_normalizer import BasicTextNormalizer
from transformers.trainer_pt_utils import IterableDatasetShard
from transformers.trainer_utils import get_last_checkpoint, is_main_process
from transformers.utils import check_min_version, send_example_telemetry
from transformers.utils.versions import require_version


# Will error if the minimal version of Transformers is not installed. Remove at your own risk.
check_min_version("4.25.0.dev0")

require_version(
    "datasets>=1.18.2",
    "To fix: pip install -r examples/pytorch/speech-recognition/requirements.txt",
)

logger = logging.getLogger(__name__)

@dataclass
class ModelArguments:
    """
    Arguments pertaining to which model/config/tokenizer we are going to fine-tune from.
    """

    model_name_or_path: str = field(
        metadata={
            "help": "Path to pretrained model or model identifier from huggingface.co/models"
        }
    )
    config_name: Optional[str] = field(
        default=None,
        metadata={
            "help": "Pretrained config name or path if not the same as model_name"
        },
    )
    tokenizer_name: Optional[str] = field(
        default=None,
        metadata={
            "help": "Pretrained tokenizer name or path if not the same as model_name"
        },
    )
    feature_extractor_name: Optional[str] = field(
        default=None,
        metadata={
            "help": "Feature extractor name or path if not the same as model_name"
        },
    )
    cache_dir: Optional[str] = field(
        default=None,
        metadata={
            "help": "Where to store the pretrained models downloaded from huggingface.co"
        },
    )
    use_fast_tokenizer: bool = field(
        default=True,
        metadata={
            "help": "Whether to use one of the fast tokenizers (backed by the tokenizers library) or not."
        },
    )
    model_revision: str = field(
        default="main",
        metadata={
            "help": "The specific model version to use (can be a branch name, tag name or commit id)."
        },
    )
    use_auth_token: bool = field(
        default=False,
        metadata={
            "help": (
                "Will use the token generated when running `huggingface-cli login` (necessary to use this script "
                "with private models)."
            )
        },
    )
    freeze_feature_encoder: bool = field(
        default=True,
        metadata={"help": "Whether to freeze the feature encoder layers of the model."},
    )
    freeze_encoder: bool = field(
        default=False,
        metadata={"help": "Whether to freeze the entire encoder of the seq2seq model."},
    )
    forced_decoder_ids: List[List[int]] = field(
        default=None,
        metadata={
            "help": (
                "A list of pairs of integers which indicates a mapping from generation indices to token indices "
                "that will be forced before sampling. For example, [[0, 123]] means the first generated token "
                "will always be a token of index 123."
            )
        },
    )
    suppress_tokens: List[int] = field(
        default=None,
        metadata={"help": "A list of tokens that will be suppressed at generation."},
    )
    model_index_name: str = field(
        default=None, metadata={"help": "Pretty name for the model card."}
    )

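# Illustrative note (not in the original script): HfArgumentParser turns each dataclass
# field above into a CLI flag, e.g.
#   --model_name_or_path openai/whisper-small --freeze_encoder True
# (the model id here is an example only).
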
@dataclass
class DataTrainingArguments:
    """
    Arguments pertaining to what data we are going to input our model for training and eval.
    """

    dataset_name: str = field(
        default=None,
        metadata={"help": "The name of the dataset to use (via the datasets library)."},
    )
    dataset_config_name: Optional[str] = field(
        default=None,
        metadata={
            "help": "The configuration name of the dataset to use (via the datasets library)."
        },
    )
    text_column: Optional[str] = field(
        default=None,
        metadata={
            "help": "The name of the column in the datasets containing the full texts."
        },
    )
    max_train_samples: Optional[int] = field(
        default=None,
        metadata={
            "help": (
                "For debugging purposes or quicker training, truncate the number of training examples to this "
                "value if set."
            )
        },
    )
    max_eval_samples: Optional[int] = field(
        default=None,
        metadata={
            "help": (
                "For debugging purposes or quicker training, truncate the number of evaluation examples to this "
                "value if set."
            )
        },
    )
    audio_column_name: str = field(
        default="audio",
        metadata={
            "help": "The name of the dataset column containing the audio data. Defaults to 'audio'"
        },
    )
    text_column_name: str = field(
        default="text",
        metadata={
            "help": "The name of the dataset column containing the text data. Defaults to 'text'"
        },
    )
    max_duration_in_seconds: float = field(
        default=20.0,
        metadata={
            "help": (
                "Truncate audio files that are longer than `max_duration_in_seconds` seconds to"
                " `max_duration_in_seconds`"
            )
        },
    )
    min_duration_in_seconds: float = field(
        default=0.0,
        metadata={
            "help": "Filter audio files that are shorter than `min_duration_in_seconds` seconds"
        },
    )
    train_split_name: str = field(
        default="train",
        metadata={
            "help": "The name of the training data set split to use (via the datasets library). Defaults to 'train'"
        },
    )
    eval_split_name: str = field(
        default="test",
        metadata={
            "help": "The name of the evaluation data set split to use (via the datasets library). Defaults to 'test'"
        },
    )
    do_lower_case: bool = field(
        default=False,
        metadata={"help": "Whether the target text should be lower cased."},
    )
    do_remove_punctuation: bool = field(
        default=False,
        metadata={"help": "Whether the target text should be stripped of punctuation."},
    )
    do_normalize_eval: bool = field(
        default=True,
        metadata={
            "help": "Whether to normalise the references and predictions in the eval WER calculation."
        },
    )
    language: str = field(
        default=None,
        metadata={
            "help": (
                "Language for multilingual fine-tuning. This argument should be set for multilingual fine-tuning "
                "only. For English speech recognition, it should be set to `None`."
            )
        },
    )
    task: str = field(
        default="transcribe",
        metadata={
            "help": "Task, either `transcribe` for speech recognition or `translate` for speech translation."
        },
    )
    shuffle_buffer_size: Optional[int] = field(
        default=500,
        metadata={
            "help": (
                "The number of streamed examples to download before shuffling them. The larger the buffer, "
                "the closer it is to real offline shuffling."
            )
        },
    )
    streaming: bool = field(
        default=True,
        metadata={
            "help": "Whether to use streaming mode to load and pre-process the data."
        },
    )

            +
            @dataclass
         | 
| 275 | 
            +
            class DataCollatorSpeechSeq2SeqWithPadding:
         | 
| 276 | 
            +
                """
         | 
| 277 | 
            +
                Data collator that will dynamically pad the inputs received.
         | 
| 278 | 
            +
                Args:
         | 
| 279 | 
            +
                    processor ([`WhisperProcessor`])
         | 
| 280 | 
            +
                        The processor used for processing the data.
         | 
| 281 | 
            +
                    decoder_start_token_id (`int`)
         | 
| 282 | 
            +
                        The begin-of-sentence of the decoder.
         | 
| 283 | 
            +
                """
         | 
| 284 | 
            +
             | 
| 285 | 
            +
                processor: Any
         | 
| 286 | 
            +
                decoder_start_token_id: int
         | 
| 287 | 
            +
             | 
| 288 | 
            +
                def __call__(
         | 
| 289 | 
            +
                    self, features: List[Dict[str, Union[List[int], torch.Tensor]]]
         | 
| 290 | 
            +
                ) -> Dict[str, torch.Tensor]:
         | 
| 291 | 
            +
                    # split inputs and labels since they have to be of different lengths and need
         | 
| 292 | 
            +
                    # different padding methods
         | 
| 293 | 
            +
                    model_input_name = self.processor.model_input_names[0]
         | 
| 294 | 
            +
                    input_features = [
         | 
| 295 | 
            +
                        {model_input_name: feature[model_input_name]} for feature in features
         | 
| 296 | 
            +
                    ]
         | 
| 297 | 
            +
                    label_features = [{"input_ids": feature["labels"]} for feature in features]
         | 
| 298 | 
            +
             | 
| 299 | 
            +
                    batch = self.processor.feature_extractor.pad(
         | 
| 300 | 
            +
                        input_features, return_tensors="pt"
         | 
| 301 | 
            +
                    )
         | 
| 302 | 
            +
             | 
| 303 | 
            +
                    labels_batch = self.processor.tokenizer.pad(label_features, return_tensors="pt")
         | 
| 304 | 
            +
             | 
| 305 | 
            +
                    # replace padding with -100 to ignore loss correctly
         | 
| 306 | 
            +
                    labels = labels_batch["input_ids"].masked_fill(
         | 
| 307 | 
            +
                        labels_batch.attention_mask.ne(1), -100
         | 
| 308 | 
            +
                    )
         | 
| 309 | 
            +
             | 
| 310 | 
            +
                    # if bos token is appended in previous tokenization step,
         | 
| 311 | 
            +
                    # cut bos token here as it's append later anyways
         | 
| 312 | 
            +
                    if (labels[:, 0] == self.decoder_start_token_id).all().cpu().item():
         | 
| 313 | 
            +
                        labels = labels[:, 1:]
         | 
| 314 | 
            +
             | 
| 315 | 
            +
                    batch["labels"] = labels
         | 
| 316 | 
            +
             | 
| 317 | 
            +
                    return batch
         | 
| 318 | 
            +
             | 
| 319 | 
            +
             | 
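For reference, a minimal sketch of how this collator pads a toy batch. The `openai/whisper-small` checkpoint and the dummy spectrograms are illustrative assumptions, not part of this commit:

import numpy as np
from transformers import WhisperProcessor

# Illustrative checkpoint; any Whisper processor with the same feature size works.
processor = WhisperProcessor.from_pretrained("openai/whisper-small")
collator = DataCollatorSpeechSeq2SeqWithPadding(
    processor=processor,
    decoder_start_token_id=processor.tokenizer.convert_tokens_to_ids("<|startoftranscript|>"),
)
# Two dummy examples shaped like the output of `prepare_dataset` further below:
# an 80x3000 log-Mel spectrogram plus tokenized labels of unequal length.
features = [
    {"input_features": np.zeros((80, 3000), dtype=np.float32), "labels": [7, 8, 9]},
    {"input_features": np.zeros((80, 3000), dtype=np.float32), "labels": [7, 8]},
]
batch = collator(features)
print(batch["input_features"].shape)  # torch.Size([2, 80, 3000])
print(batch["labels"])                # shorter label row is padded with -100
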
def load_maybe_streaming_dataset(
    dataset_name, dataset_config_name, split="train", streaming=True, **kwargs
):
    """
    Utility function to load a dataset in streaming mode. For datasets with multiple splits,
    each split is loaded individually and the splits are then combined by taking alternating
    examples from each (interleaving).
    """
    if "+" in split:
        # load multiple splits separated by the `+` symbol with streaming mode
        dataset_splits = [
            load_dataset(
                dataset_name,
                dataset_config_name,
                split=split_name,
                streaming=streaming,
                **kwargs,
            )
            for split_name in split.split("+")
        ]
        # interleave multiple splits to form one dataset
        interleaved_dataset = interleave_datasets(dataset_splits)
        return interleaved_dataset
    else:
        # load a single split *with* streaming mode
        dataset = load_dataset(
            dataset_name,
            dataset_config_name,
            split=split,
            streaming=streaming,
            **kwargs,
        )
        return dataset


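A hedged usage example of the helper above; the dataset name and config are illustrative, not the ones this script trains on:

# Interleave two streamed splits into a single iterable dataset.
# "google/fleurs" / "ar_eg" are illustrative choices of a public ASR dataset.
train_data = load_maybe_streaming_dataset(
    "google/fleurs",
    "ar_eg",
    split="train+validation",
    streaming=True,
)
print(next(iter(train_data)))  # first streamed example, downloaded lazily
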
def main():
    # 1. Parse input arguments
    # See all possible arguments in src/transformers/training_args.py
    # or by passing the --help flag to this script.
    # We now keep distinct sets of args, for a cleaner separation of concerns.
    parser = HfArgumentParser(
        (ModelArguments, DataTrainingArguments, Seq2SeqTrainingArguments)
    )

    if len(sys.argv) == 2 and sys.argv[1].endswith(".json"):
        # If we pass only one argument to the script and it's the path to a json file,
        # let's parse it to get our arguments.
        model_args, data_args, training_args = parser.parse_json_file(
            json_file=os.path.abspath(sys.argv[1])
        )
    else:
        model_args, data_args, training_args = parser.parse_args_into_dataclasses()

    # Sending telemetry. Tracking the example usage helps us better allocate resources to maintain them. The
    # information sent is the one passed as arguments along with your Python/PyTorch versions.
    send_example_telemetry(
        "run_speech_recognition_seq2seq_streaming", model_args, data_args
    )

    # 2. Setup logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        handlers=[logging.StreamHandler(sys.stdout)],
    )
    log_level = training_args.get_process_log_level()
    logger.setLevel(log_level)
    datasets.utils.logging.set_verbosity(log_level)
    transformers.utils.logging.set_verbosity(log_level)
    transformers.utils.logging.enable_default_handler()
    transformers.utils.logging.enable_explicit_format()

    logger.setLevel(
        logging.INFO if is_main_process(training_args.local_rank) else logging.WARN
    )

    # Log a small summary on each process:
    logger.warning(
        f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}, "
        f"distributed training: {bool(training_args.local_rank != -1)}, 16-bit training: {training_args.fp16}"
    )
    logger.info(f"Training/evaluation parameters {training_args}")

    # Set the verbosity of the Transformers logger to info (on main process only):
    if is_main_process(training_args.local_rank):
        transformers.utils.logging.set_verbosity_info()
    logger.info("Training/evaluation parameters %s", training_args)

    # 3. Detect the last checkpoint and possibly continue from it
    last_checkpoint = None
    if (
        os.path.isdir(training_args.output_dir)
        and training_args.do_train
        and not training_args.overwrite_output_dir
    ):
        last_checkpoint = get_last_checkpoint(training_args.output_dir)
        if last_checkpoint is None and len(os.listdir(training_args.output_dir)) > 0:
            raise ValueError(
                f"Output directory ({training_args.output_dir}) already exists and is not empty. "
                "Use --overwrite_output_dir to overcome this."
            )
        elif (
            last_checkpoint is not None and training_args.resume_from_checkpoint is None
        ):
            logger.info(
                f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this behavior, change "
                "the `--output_dir` or add `--overwrite_output_dir` to train from scratch."
            )

    # Set seed before initializing model.
    set_seed(training_args.seed)

    # 4. Load dataset
    raw_datasets = IterableDatasetDict()

    if training_args.do_train:
        raw_datasets["train"] = load_maybe_streaming_dataset(
            data_args.dataset_name,
            data_args.dataset_config_name,
            split=data_args.train_split_name,
            streaming=True,
            use_auth_token=True if model_args.use_auth_token else None,
        )

    if training_args.do_eval:
        raw_datasets["eval"] = load_maybe_streaming_dataset(
            "arbml/mgb3",
            data_args.dataset_config_name,
            split="train",
            streaming=False,
            use_auth_token=True if model_args.use_auth_token else None,
        )

    raw_datasets_features = list(next(iter(raw_datasets.values())).features.keys())

    if data_args.audio_column_name not in raw_datasets_features:
        raise ValueError(
            f"--audio_column_name '{data_args.audio_column_name}' not found in dataset '{data_args.dataset_name}'. "
            "Make sure to set `--audio_column_name` to the correct audio column - one of "
            f"{', '.join(raw_datasets_features)}."
        )

    if data_args.text_column_name not in raw_datasets_features:
        raise ValueError(
            f"--text_column_name {data_args.text_column_name} not found in dataset '{data_args.dataset_name}'. "
            "Make sure to set `--text_column_name` to the correct text column - one of "
            f"{', '.join(raw_datasets_features)}."
        )

    # 5. Load pretrained model, tokenizer, and feature extractor
    #
    # Distributed training:
    # The .from_pretrained methods guarantee that only one local process can concurrently
    # download the model & vocab.
    config = AutoConfig.from_pretrained(
        model_args.config_name
        if model_args.config_name
        else model_args.model_name_or_path,
        cache_dir=model_args.cache_dir,
        revision=model_args.model_revision,
        use_auth_token=True if model_args.use_auth_token else None,
    )

    config.update(
        {
            "forced_decoder_ids": model_args.forced_decoder_ids,
            "suppress_tokens": model_args.suppress_tokens,
        }
    )

    if training_args.gradient_checkpointing:
        config.update({"use_cache": False})

    feature_extractor = AutoFeatureExtractor.from_pretrained(
        model_args.feature_extractor_name
        if model_args.feature_extractor_name
        else model_args.model_name_or_path,
        cache_dir=model_args.cache_dir,
        revision=model_args.model_revision,
        use_auth_token=True if model_args.use_auth_token else None,
    )
    tokenizer = AutoTokenizer.from_pretrained(
        model_args.tokenizer_name
        if model_args.tokenizer_name
        else model_args.model_name_or_path,
        cache_dir=model_args.cache_dir,
        use_fast=model_args.use_fast_tokenizer,
        revision=model_args.model_revision,
        use_auth_token=True if model_args.use_auth_token else None,
    )
    model = AutoModelForSpeechSeq2Seq.from_pretrained(
        model_args.model_name_or_path,
        config=config,
        cache_dir=model_args.cache_dir,
        revision=model_args.model_revision,
        use_auth_token=True if model_args.use_auth_token else None,
    )

    if model.config.decoder_start_token_id is None:
        raise ValueError(
            "Make sure that `config.decoder_start_token_id` is correctly defined"
        )

    if model_args.freeze_feature_encoder:
        model.freeze_feature_encoder()

    if model_args.freeze_encoder:
        model.freeze_encoder()
        model.model.encoder.gradient_checkpointing = False

    if data_args.language is not None:
        # We only need to set the task id when the language is specified (i.e. in a multilingual setting)
        tokenizer.set_prefix_tokens(language=data_args.language, task=data_args.task)

    # 6. Resample speech dataset if necessary
    dataset_sampling_rate = (
        next(iter(raw_datasets.values()))
        .features[data_args.audio_column_name]
        .sampling_rate
    )
    if dataset_sampling_rate != feature_extractor.sampling_rate:
        raw_datasets = raw_datasets.cast_column(
            data_args.audio_column_name,
            datasets.features.Audio(sampling_rate=feature_extractor.sampling_rate),
        )

    # 7. Preprocessing the datasets.
    # We need to read the audio files as arrays and tokenize the targets.
    max_input_length = (
        data_args.max_duration_in_seconds * feature_extractor.sampling_rate
    )
    min_input_length = (
        data_args.min_duration_in_seconds * feature_extractor.sampling_rate
    )
    audio_column_name = data_args.audio_column_name
    text_column_name = data_args.text_column_name
    model_input_name = feature_extractor.model_input_names[0]
    do_lower_case = data_args.do_lower_case
    do_remove_punctuation = data_args.do_remove_punctuation
    normalizer = BasicTextNormalizer()  # 'official' text normalizer from OpenAI

    if data_args.max_train_samples is not None:
        raw_datasets["train"] = raw_datasets["train"].take(data_args.max_train_samples)

    if data_args.max_eval_samples is not None:
        raw_datasets["eval"] = raw_datasets["eval"].select(
            range(data_args.max_eval_samples)
        )

    def prepare_dataset(batch):
        # process audio
        sample = batch[audio_column_name]
        inputs = feature_extractor(
            sample["array"], sampling_rate=sample["sampling_rate"]
        )
        # process audio length
        batch[model_input_name] = inputs.get(model_input_name)[0]
        batch["input_length"] = len(sample["array"])

        # process targets
        input_str = (
            batch[text_column_name].lower()
            if do_lower_case
            else batch[text_column_name]
        )
        if do_remove_punctuation:
            input_str = normalizer(input_str).strip()
        batch["labels"] = tokenizer(input_str).input_ids
        return batch

    with training_args.main_process_first(desc="dataset map pre-processing"):
        vectorized_datasets = raw_datasets.map(
            prepare_dataset,
            remove_columns=raw_datasets_features,
        ).with_format("torch")

        if training_args.do_train:
            vectorized_datasets["train"] = vectorized_datasets["train"].shuffle(
                buffer_size=data_args.shuffle_buffer_size,
                seed=training_args.seed,
            )

    # filter training data that is shorter than min_input_length or longer than
    # max_input_length
    def is_audio_in_length_range(length):
        return min_input_length < length < max_input_length

    vectorized_datasets["train"] = vectorized_datasets["train"].filter(
        is_audio_in_length_range,
        input_columns=["input_length"],
    )

    # 8. Load Metric
    metric = evaluate.load("wer")
    do_normalize_eval = data_args.do_normalize_eval

    def compute_metrics(pred):
        pred_ids = pred.predictions

        pred.label_ids[pred.label_ids == -100] = tokenizer.pad_token_id

        pred_str = tokenizer.batch_decode(pred_ids, skip_special_tokens=True)
        # we do not want to group tokens when computing the metrics
        label_str = tokenizer.batch_decode(pred.label_ids, skip_special_tokens=True)

        if do_normalize_eval:
            pred_str = [normalizer(pred) for pred in pred_str]
            label_str = [normalizer(label) for label in label_str]
            # filtering step to only evaluate the samples that correspond to non-empty references:
            pred_str = [
                pred_str[i] for i in range(len(pred_str)) if len(label_str[i]) > 0
            ]
            label_str = [
                label_str[i] for i in range(len(label_str)) if len(label_str[i]) > 0
            ]

        wer = 100 * metric.compute(predictions=pred_str, references=label_str)

        return {"wer": wer}

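As a sanity check, the WER metric used above can be exercised on its own; this snippet is illustrative and independent of the training run:

import evaluate

wer_metric = evaluate.load("wer")
# One substitution in a four-word reference -> 25% WER.
score = 100 * wer_metric.compute(
    predictions=["the quick brown dog"],
    references=["the quick brown fox"],
)
print(f"WER: {score:.1f}%")  # WER: 25.0%
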
    # 9. Create a single speech processor
    if is_main_process(training_args.local_rank):
        # save feature extractor, tokenizer and config
        feature_extractor.save_pretrained(training_args.output_dir)
        tokenizer.save_pretrained(training_args.output_dir)
        config.save_pretrained(training_args.output_dir)

    processor = AutoProcessor.from_pretrained(training_args.output_dir)

    # 10. Define data collator
    data_collator = DataCollatorSpeechSeq2SeqWithPadding(
        processor=processor,
        decoder_start_token_id=model.config.decoder_start_token_id,
    )

    # 11. Configure Trainer
    # Trainer callback to reinitialise and reshuffle the streamable datasets at the beginning of each epoch
    # Only required for streaming: the Trainer automatically shuffles non-streaming datasets
    class ShuffleCallback(TrainerCallback):
        def on_epoch_begin(self, args, state, control, train_dataloader, **kwargs):
            if isinstance(train_dataloader.dataset, IterableDatasetShard):
                pass  # set_epoch() is handled by the Trainer
            elif isinstance(train_dataloader.dataset, IterableDataset):
                train_dataloader.dataset.set_epoch(train_dataloader.dataset._epoch + 1)

    # Initialize Trainer
    trainer = Seq2SeqTrainer(
        model=model,
        args=training_args,
        train_dataset=vectorized_datasets["train"] if training_args.do_train else None,
        eval_dataset=vectorized_datasets["eval"] if training_args.do_eval else None,
        tokenizer=feature_extractor,
        data_collator=data_collator,
        compute_metrics=compute_metrics
        if training_args.predict_with_generate
        else None,
        callbacks=[ShuffleCallback()],
    )

    # 12. Training
    if training_args.do_train:
        checkpoint = None
        if training_args.resume_from_checkpoint is not None:
            checkpoint = training_args.resume_from_checkpoint
        elif last_checkpoint is not None:
            checkpoint = last_checkpoint
        train_result = trainer.train(resume_from_checkpoint=checkpoint)
        trainer.save_model()  # Saves the feature extractor too for easy upload

        metrics = train_result.metrics
        if data_args.max_train_samples:
            metrics["train_samples"] = data_args.max_train_samples
        trainer.log_metrics("train", metrics)
        trainer.save_metrics("train", metrics)
        trainer.save_state()

    # 13. Evaluation
    results = {}
    if training_args.do_eval:
        logger.info("*** Evaluate ***")
        metrics = trainer.evaluate(
            metric_key_prefix="eval",
            max_length=training_args.generation_max_length,
            num_beams=training_args.generation_num_beams,
        )
        if data_args.max_eval_samples:
            metrics["eval_samples"] = data_args.max_eval_samples

        trainer.log_metrics("eval", metrics)
        trainer.save_metrics("eval", metrics)

    # 14. Write Training Stats
    kwargs = {
        "finetuned_from": model_args.model_name_or_path,
        "tasks": "automatic-speech-recognition",
        "tags": "whisper-event",
    }
    if data_args.dataset_name is not None:
        kwargs["dataset_tags"] = data_args.dataset_name
        if data_args.dataset_config_name is not None:
            kwargs[
                "dataset"
            ] = f"{data_args.dataset_name} {data_args.dataset_config_name}"
        else:
            kwargs["dataset"] = data_args.dataset_name
        if "common_voice" in data_args.dataset_name:
            kwargs["language"] = data_args.dataset_config_name[:2]
        if model_args.model_index_name is not None:
            kwargs["model_name"] = model_args.model_index_name

    if training_args.push_to_hub:
        trainer.push_to_hub(**kwargs)
    else:
        trainer.create_model_card(**kwargs)

    return results


if __name__ == "__main__":
    main()
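For orientation, a hypothetical invocation of this script; the checkpoint, dataset id, and hyperparameter values below are illustrative assumptions, not the exact configuration of this commit (see run_mgb2.sh in this repo for the actual launch flags):

python run_speech_recognition_seq2seq_mixed_mgb2.py \
    --model_name_or_path openai/whisper-small \
    --dataset_name arbml/mgb2_speech \
    --language Arabic \
    --task transcribe \
    --train_split_name train \
    --output_dir ./whisper-small-mgb2 \
    --do_train --do_eval \
    --per_device_train_batch_size 32 \
    --learning_rate 1e-5 \
    --max_steps 5000 \
    --predict_with_generate
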
run_speech_recognition_seq2seq_streaming.py
ADDED
@@ -0,0 +1,608 @@
#!/usr/bin/env python
# coding=utf-8
# Copyright 2022 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Fine-tuning the library models for sequence to sequence speech recognition
with 🤗 Datasets' streaming mode.
"""
# You can also adapt this script for your own sequence to sequence speech
# recognition task. Pointers for this are left as comments.

import logging
import os
import sys
from dataclasses import dataclass, field
from typing import Any, Dict, List, Optional, Union

import datasets
import torch
from datasets import IterableDatasetDict, interleave_datasets, load_dataset
from torch.utils.data import IterableDataset

import evaluate
import transformers
from transformers import (
    AutoConfig,
    AutoFeatureExtractor,
    AutoModelForSpeechSeq2Seq,
    AutoProcessor,
    AutoTokenizer,
    HfArgumentParser,
    Seq2SeqTrainer,
    Seq2SeqTrainingArguments,
    TrainerCallback,
    set_seed,
)
from transformers.trainer_pt_utils import IterableDatasetShard
from transformers.trainer_utils import get_last_checkpoint, is_main_process
from transformers.utils import check_min_version, send_example_telemetry
from transformers.utils.versions import require_version
from transformers.models.whisper.english_normalizer import BasicTextNormalizer

os.environ['LD_LIBRARY_PATH'] = '/usr/lib/x86_64-linux-gnu'
# Will error if the minimal version of Transformers is not installed. Remove at your own risk.
check_min_version("4.25.0.dev0")

require_version("datasets>=1.18.2", "To fix: pip install -r examples/pytorch/speech-recognition/requirements.txt")

logger = logging.getLogger(__name__)


@dataclass
class ModelArguments:
    """
    Arguments pertaining to which model/config/tokenizer we are going to fine-tune from.
    """

    model_name_or_path: str = field(
        metadata={"help": "Path to pretrained model or model identifier from huggingface.co/models"}
    )
    config_name: Optional[str] = field(
        default=None, metadata={"help": "Pretrained config name or path if not the same as model_name"}
    )
    tokenizer_name: Optional[str] = field(
        default=None, metadata={"help": "Pretrained tokenizer name or path if not the same as model_name"}
    )
    feature_extractor_name: Optional[str] = field(
        default=None, metadata={"help": "Feature extractor name or path if not the same as model_name"}
    )
    cache_dir: Optional[str] = field(
        default=None,
        metadata={"help": "Where to store the pretrained models downloaded from huggingface.co"},
    )
    use_fast_tokenizer: bool = field(
        default=True,
        metadata={"help": "Whether or not to use a fast tokenizer (backed by the tokenizers library)."},
    )
    model_revision: str = field(
        default="main",
        metadata={"help": "The specific model version to use (can be a branch name, tag name or commit id)."},
    )
    use_auth_token: bool = field(
        default=False,
        metadata={
            "help": (
                "Will use the token generated when running `huggingface-cli login` (necessary to use this script "
                "with private models)."
            )
        },
    )
    freeze_feature_encoder: bool = field(
        default=True, metadata={"help": "Whether to freeze the feature encoder layers of the model."}
    )
    freeze_encoder: bool = field(
        default=False, metadata={"help": "Whether to freeze the entire encoder of the seq2seq model."}
    )
    forced_decoder_ids: List[List[int]] = field(
        default=None,
        metadata={
            "help": (
                "A list of pairs of integers which indicates a mapping from generation indices to token indices "
                "that will be forced before sampling. For example, [[0, 123]] means the first generated token "
                "will always be a token of index 123."
            )
        },
    )
    suppress_tokens: List[int] = field(
        default=None, metadata={"help": "A list of tokens that will be suppressed at generation."}
    )
    model_index_name: str = field(default=None, metadata={"help": "Pretty name for the model card."})

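For context, a minimal sketch of how these dataclasses are typically consumed, mirroring the `main()` of the companion script above (DataTrainingArguments is defined just below); flag values come from the command line:

# Parse command-line flags into the three argument dataclasses.
parser = HfArgumentParser((ModelArguments, DataTrainingArguments, Seq2SeqTrainingArguments))
model_args, data_args, training_args = parser.parse_args_into_dataclasses()
print(model_args.model_name_or_path, data_args.dataset_name)
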
@dataclass
class DataTrainingArguments:
    """
    Arguments pertaining to what data we are going to input our model for training and eval.
    """

    dataset_name: str = field(
        default=None, metadata={"help": "The name of the dataset to use (via the datasets library)."}
    )
    dataset_config_name: Optional[str] = field(
        default=None, metadata={"help": "The configuration name of the dataset to use (via the datasets library)."}
    )
    text_column: Optional[str] = field(
        default=None,
        metadata={"help": "The name of the column in the datasets containing the full texts (for summarization)."},
    )
    max_train_samples: Optional[int] = field(
        default=None,
        metadata={
            "help": (
                "For debugging purposes or quicker training, truncate the number of training examples to this "
                "value if set."
            )
        },
    )
    max_eval_samples: Optional[int] = field(
        default=None,
        metadata={
            "help": (
                "For debugging purposes or quicker training, truncate the number of evaluation examples to this "
                "value if set."
            )
        },
    )
    audio_column_name: str = field(
        default="audio",
        metadata={"help": "The name of the dataset column containing the audio data. Defaults to 'audio'"},
    )
    text_column_name: str = field(
        default="text",
        metadata={"help": "The name of the dataset column containing the text data. Defaults to 'text'"},
    )
    max_duration_in_seconds: float = field(
        default=20.0,
        metadata={
            "help": (
                "Truncate audio files that are longer than `max_duration_in_seconds` seconds to"
                " `max_duration_in_seconds`"
            )
        },
    )
    min_duration_in_seconds: float = field(
        default=0.0, metadata={"help": "Filter audio files that are shorter than `min_duration_in_seconds` seconds"}
    )
    train_split_name: str = field(
        default="train",
        metadata={
            "help": "The name of the training data set split to use (via the datasets library). Defaults to 'train'"
        },
    )
    eval_split_name: str = field(
        default="test",
        metadata={
            "help": "The name of the evaluation data set split to use (via the datasets library). Defaults to 'test'"
        },
    )
    do_lower_case: bool = field(
        default=False,
        metadata={"help": "Whether the target text should be lower cased."},
    )
    do_remove_punctuation: bool = field(
        default=False,
        metadata={"help": "Whether the target text should be stripped of punctuation."},
    )
    do_normalize_eval: bool = field(
        default=True,
        metadata={"help": "Whether to normalise the references and predictions in the eval WER calculation."},
    )
    language: str = field(
        default=None,
        metadata={
            "help": (
                "Language for multilingual fine-tuning. This argument should be set for multilingual fine-tuning "
                "only. For English speech recognition, it should be set to `None`."
            )
        },
    )
    task: str = field(
        default="transcribe",
        metadata={"help": "Task, either `transcribe` for speech recognition or `translate` for speech translation."},
    )
    shuffle_buffer_size: Optional[int] = field(
        default=500,
         | 
| 217 | 
            +
                    metadata={
         | 
| 218 | 
            +
                        "help": (
         | 
| 219 | 
            +
                            "The number of streamed examples to download before shuffling them. The large the buffer, "
         | 
| 220 | 
            +
                            "the closer it is to real offline shuffling."
         | 
| 221 | 
            +
                        )
         | 
| 222 | 
            +
                    },
         | 
| 223 | 
            +
                )
         | 
| 224 | 
            +
             | 
| 225 | 
            +
             | 
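# A minimal sketch (hypothetical values, not from this repo) of how the
# `DataTrainingArguments` fields above surface as CLI flags via
# `HfArgumentParser`:
#
#     parser = HfArgumentParser(DataTrainingArguments)
#     (data_args,) = parser.parse_args_into_dataclasses(
#         ["--dataset_name", "mozilla-foundation/common_voice_11_0",
#          "--dataset_config_name", "ar",
#          "--max_duration_in_seconds", "30"]
#     )
#     assert data_args.max_duration_in_seconds == 30.0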

@dataclass
class DataCollatorSpeechSeq2SeqWithPadding:
    """
    Data collator that will dynamically pad the inputs received.
    Args:
        processor ([`WhisperProcessor`])
            The processor used for processing the data.
        decoder_start_token_id (`int`)
            The begin-of-sentence token id of the decoder.
    """

    processor: Any
    decoder_start_token_id: int

    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        # split inputs and labels since they have to be of different lengths and need
        # different padding methods
        model_input_name = self.processor.model_input_names[0]
        input_features = [{model_input_name: feature[model_input_name]} for feature in features]
        label_features = [{"input_ids": feature["labels"]} for feature in features]

        batch = self.processor.feature_extractor.pad(input_features, return_tensors="pt")

        labels_batch = self.processor.tokenizer.pad(label_features, return_tensors="pt")

        # replace padding with -100 to ignore loss correctly
        labels = labels_batch["input_ids"].masked_fill(labels_batch.attention_mask.ne(1), -100)

        # if bos token is appended in previous tokenization step,
        # cut bos token here as it's appended later anyway
        if (labels[:, 0] == self.decoder_start_token_id).all().cpu().item():
            labels = labels[:, 1:]

        batch["labels"] = labels

        return batch

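# A toy illustration of the -100 masking performed by the collator above,
# with made-up token ids (positions where attention_mask == 0 are excluded
# from the cross-entropy loss):
#
#     input_ids = torch.tensor([[50258, 11, 13, 50257],
#                               [50258, 17, 50257, 50257]])
#     attention_mask = torch.tensor([[1, 1, 1, 1],
#                                    [1, 1, 1, 0]])
#     labels = input_ids.masked_fill(attention_mask.ne(1), -100)
#     # labels[1] -> tensor([50258, 17, 50257, -100])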
def load_streaming_dataset(dataset_name, dataset_config_name, split="train", **kwargs):
    """
    Utility function to load a dataset in streaming mode. For datasets with multiple splits,
    each split is loaded individually and then the splits are combined by taking alternating
    examples from each (interleaving).
    """
    if "+" in split:
        # load multiple splits separated by the `+` symbol with streaming mode
        dataset_splits = [
            load_dataset(dataset_name, dataset_config_name, split=split_name, streaming=True, **kwargs)
            for split_name in split.split("+")
        ]
        # interleave multiple splits to form one dataset
        interleaved_dataset = interleave_datasets(dataset_splits)
        return interleaved_dataset
    else:
        # load a single split *with* streaming mode
        dataset = load_dataset(dataset_name, dataset_config_name, split=split, streaming=True, **kwargs)
        return dataset

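# Example usage of `load_streaming_dataset` (hypothetical dataset/config --
# substitute your own) combining two streamed splits into one interleaved
# stream:
#
#     train_ds = load_streaming_dataset(
#         "mozilla-foundation/common_voice_11_0", "ar",
#         split="train+validation", use_auth_token=True,
#     )
#     print(next(iter(train_ds)))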

def main():
    # 1. Parse input arguments
    # See all possible arguments in src/transformers/training_args.py
    # or by passing the --help flag to this script.
    # We now keep distinct sets of args, for a cleaner separation of concerns.
    parser = HfArgumentParser((ModelArguments, DataTrainingArguments, Seq2SeqTrainingArguments))

    if len(sys.argv) == 2 and sys.argv[1].endswith(".json"):
        # If we pass only one argument to the script and it's the path to a json file,
        # let's parse it to get our arguments.
        model_args, data_args, training_args = parser.parse_json_file(json_file=os.path.abspath(sys.argv[1]))
    else:
        model_args, data_args, training_args = parser.parse_args_into_dataclasses()

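    # The single-JSON-file branch above lets a config file drive the whole
    # run. A minimal sketch (hypothetical values), invoked as
    # `python run_speech_recognition_seq2seq_streaming.py args.json`:
    #
    #     {"model_name_or_path": "openai/whisper-small",
    #      "dataset_name": "mozilla-foundation/common_voice_11_0",
    #      "dataset_config_name": "ar",
    #      "output_dir": "./whisper-small-ar",
    #      "do_train": true,
    #      "do_eval": true}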
    # Sending telemetry. Tracking the example usage helps us better allocate resources to maintain them. The
    # information sent is the one passed as arguments along with your Python/PyTorch versions.
    send_example_telemetry("run_speech_recognition_seq2seq_streaming", model_args, data_args)

    # 2. Setup logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        handlers=[logging.StreamHandler(sys.stdout)],
    )
    log_level = training_args.get_process_log_level()
    logger.setLevel(log_level)
    datasets.utils.logging.set_verbosity(log_level)
    transformers.utils.logging.set_verbosity(log_level)
    transformers.utils.logging.enable_default_handler()
    transformers.utils.logging.enable_explicit_format()

    logger.setLevel(logging.INFO if is_main_process(training_args.local_rank) else logging.WARN)

    # Log on each process the small summary:
    logger.warning(
        f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}, "
        f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}"
    )

    # Set the verbosity to info of the Transformers logger (on main process only):
    if is_main_process(training_args.local_rank):
        transformers.utils.logging.set_verbosity_info()
    logger.info("Training/evaluation parameters %s", training_args)

    # 3. Detecting last checkpoint and eventually continue from last checkpoint
    last_checkpoint = None
    if os.path.isdir(training_args.output_dir) and training_args.do_train and not training_args.overwrite_output_dir:
        last_checkpoint = get_last_checkpoint(training_args.output_dir)
        if last_checkpoint is None and len(os.listdir(training_args.output_dir)) > 0:
            raise ValueError(
                f"Output directory ({training_args.output_dir}) already exists and is not empty. "
                "Use --overwrite_output_dir to overcome."
            )
        elif last_checkpoint is not None and training_args.resume_from_checkpoint is None:
            logger.info(
                f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this behavior, change "
                "the `--output_dir` or add `--overwrite_output_dir` to train from scratch."
            )

    # Set seed before initializing model.
    set_seed(training_args.seed)

    # 4. Load dataset
    raw_datasets = IterableDatasetDict()

    if training_args.do_train:
        raw_datasets["train"] = load_streaming_dataset(
            data_args.dataset_name,
            data_args.dataset_config_name,
            split=data_args.train_split_name,
            use_auth_token=True if model_args.use_auth_token else None,
        )

    if training_args.do_eval:
        raw_datasets["eval"] = load_streaming_dataset(
            data_args.dataset_name,
            data_args.dataset_config_name,
            split=data_args.eval_split_name,
            use_auth_token=True if model_args.use_auth_token else None,
        )

    raw_datasets_features = list(next(iter(raw_datasets.values())).features.keys())

    if data_args.audio_column_name not in raw_datasets_features:
        raise ValueError(
            f"--audio_column_name '{data_args.audio_column_name}' not found in dataset '{data_args.dataset_name}'. "
            "Make sure to set `--audio_column_name` to the correct audio column - one of "
            f"{', '.join(raw_datasets_features)}."
        )

    if data_args.text_column_name not in raw_datasets_features:
        raise ValueError(
            f"--text_column_name {data_args.text_column_name} not found in dataset '{data_args.dataset_name}'. "
            "Make sure to set `--text_column_name` to the correct text column - one of "
            f"{', '.join(raw_datasets_features)}."
        )

    # 5. Load pretrained model, tokenizer, and feature extractor
    #
    # Distributed training:
    # The .from_pretrained methods guarantee that only one local process can concurrently
    # download model & vocab.
    config = AutoConfig.from_pretrained(
        model_args.config_name if model_args.config_name else model_args.model_name_or_path,
        cache_dir=model_args.cache_dir,
        revision=model_args.model_revision,
        use_auth_token=True if model_args.use_auth_token else None,
    )

    config.update({"forced_decoder_ids": model_args.forced_decoder_ids, "suppress_tokens": model_args.suppress_tokens})

    feature_extractor = AutoFeatureExtractor.from_pretrained(
        model_args.feature_extractor_name if model_args.feature_extractor_name else model_args.model_name_or_path,
        cache_dir=model_args.cache_dir,
        revision=model_args.model_revision,
        use_auth_token=True if model_args.use_auth_token else None,
    )
    tokenizer = AutoTokenizer.from_pretrained(
        model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path,
        cache_dir=model_args.cache_dir,
        use_fast=model_args.use_fast_tokenizer,
        revision=model_args.model_revision,
        use_auth_token=True if model_args.use_auth_token else None,
    )
    model = AutoModelForSpeechSeq2Seq.from_pretrained(
        model_args.model_name_or_path,
        config=config,
        cache_dir=model_args.cache_dir,
        revision=model_args.model_revision,
        use_auth_token=True if model_args.use_auth_token else None,
    )
    model.config.use_cache = False

    if model.config.decoder_start_token_id is None:
        raise ValueError("Make sure that `config.decoder_start_token_id` is correctly defined")

    if model_args.freeze_feature_encoder:
        model.freeze_feature_encoder()

    if model_args.freeze_encoder:
        model.freeze_encoder()
        model.model.encoder.gradient_checkpointing = False

    if data_args.language is not None:
        # We only need to set the task id when the language is specified (i.e. in a multilingual setting)
        tokenizer.set_prefix_tokens(language=data_args.language, task=data_args.task)

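    # `set_prefix_tokens` controls the decoder prompt Whisper is conditioned
    # on; a sketch of the effect (illustrative language/task values):
    #
    #     tokenizer.set_prefix_tokens(language="arabic", task="transcribe")
    #     # decoder prompt: <|startoftranscript|><|ar|><|transcribe|><|notimestamps|>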
    # 6. Resample speech dataset if necessary
    dataset_sampling_rate = next(iter(raw_datasets.values())).features[data_args.audio_column_name].sampling_rate
    if dataset_sampling_rate != feature_extractor.sampling_rate:
        raw_datasets = raw_datasets.cast_column(
            data_args.audio_column_name, datasets.features.Audio(sampling_rate=feature_extractor.sampling_rate)
        )

    # 7. Preprocessing the datasets.
    # We need to read the audio files as arrays and tokenize the targets.
    max_input_length = data_args.max_duration_in_seconds * feature_extractor.sampling_rate
    min_input_length = data_args.min_duration_in_seconds * feature_extractor.sampling_rate
    audio_column_name = data_args.audio_column_name
    text_column_name = data_args.text_column_name
    model_input_name = feature_extractor.model_input_names[0]
    do_lower_case = data_args.do_lower_case
    do_remove_punctuation = data_args.do_remove_punctuation
    normalizer = BasicTextNormalizer()  # 'official' text normalizer from OpenAI

    if data_args.max_train_samples is not None:
        raw_datasets["train"] = raw_datasets["train"].take(data_args.max_train_samples)

    if data_args.max_eval_samples is not None:
        raw_datasets["eval"] = raw_datasets["eval"].take(data_args.max_eval_samples)

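    # Note that `input_length` (set in `prepare_dataset` below) is measured in
    # raw samples, so the duration bounds above are converted accordingly:
    #
    #     assert 20.0 * 16_000 == 320_000  # default max length at 16 kHz, in samples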
    def prepare_dataset(batch):
        # process audio
        sample = batch[audio_column_name]
        inputs = feature_extractor(sample["array"], sampling_rate=sample["sampling_rate"])
        # process audio length
        batch[model_input_name] = inputs.get(model_input_name)[0]
        batch["input_length"] = len(sample["array"])

        # process targets
        input_str = batch[text_column_name].lower() if do_lower_case else batch[text_column_name]
        if do_remove_punctuation:
            input_str = normalizer(input_str).strip()
        batch["labels"] = tokenizer(input_str).input_ids
        return batch

    with training_args.main_process_first(desc="dataset map pre-processing"):
        vectorized_datasets = raw_datasets.map(
            prepare_dataset,
            remove_columns=raw_datasets_features,
        ).with_format("torch")

        if training_args.do_train:
            vectorized_datasets["train"] = vectorized_datasets["train"].shuffle(
                buffer_size=data_args.shuffle_buffer_size,
                seed=training_args.seed,
            )

    # filter training data that is shorter than min_input_length or longer than
    # max_input_length
    def is_audio_in_length_range(length):
        return min_input_length < length < max_input_length

    # guard on do_train so eval-only runs don't index a missing "train" split
    if training_args.do_train:
        vectorized_datasets["train"] = vectorized_datasets["train"].filter(
            is_audio_in_length_range,
            input_columns=["input_length"],
        )

    # 8. Load Metric
    metric = evaluate.load("wer")
    do_normalize_eval = data_args.do_normalize_eval

    def compute_metrics(pred):
        pred_ids = pred.predictions

        pred.label_ids[pred.label_ids == -100] = tokenizer.pad_token_id

        pred_str = tokenizer.batch_decode(pred_ids, skip_special_tokens=True)
        # we do not want to group tokens when computing the metrics
        label_str = tokenizer.batch_decode(pred.label_ids, skip_special_tokens=True)

        if do_normalize_eval:
            pred_str = [normalizer(pred) for pred in pred_str]
            label_str = [normalizer(label) for label in label_str]

        wer = 100 * metric.compute(predictions=pred_str, references=label_str)

        return {"wer": wer}

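    # A small sketch of what normalisation buys in the WER above (needs the
    # `evaluate` and `jiwer` packages; strings are made up):
    #
    #     raw = 100 * metric.compute(predictions=["Hello there!"],
    #                                references=["hello there"])   # 100.0
    #     norm = 100 * metric.compute(predictions=[normalizer("Hello there!")],
    #                                 references=[normalizer("hello there")])  # 0.0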
    # 9. Create a single speech processor
    if is_main_process(training_args.local_rank):
        # save feature extractor, tokenizer and config
        feature_extractor.save_pretrained(training_args.output_dir)
        tokenizer.save_pretrained(training_args.output_dir)
        config.save_pretrained(training_args.output_dir)

    processor = AutoProcessor.from_pretrained(training_args.output_dir)

    # 10. Define data collator
    data_collator = DataCollatorSpeechSeq2SeqWithPadding(
        processor=processor,
        decoder_start_token_id=model.config.decoder_start_token_id,
    )

    # 11. Configure Trainer
    # Trainer callback to reinitialise and reshuffle the streamable datasets at the beginning of each epoch
    class ShuffleCallback(TrainerCallback):
        def on_epoch_begin(self, args, state, control, train_dataloader, **kwargs):
            if isinstance(train_dataloader.dataset, IterableDatasetShard):
                pass  # set_epoch() is handled by the Trainer
            elif isinstance(train_dataloader.dataset, IterableDataset):
                train_dataloader.dataset.set_epoch(train_dataloader.dataset._epoch + 1)

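    # The callback above leans on `IterableDataset.set_epoch`: the effective
    # shuffle seed changes with the epoch number, so the shuffle buffer is
    # refilled in a new order each epoch. A sketch (assuming a streamed,
    # shuffled dataset as built above):
    #
    #     vectorized_datasets["train"].set_epoch(1)  # epoch 1 -> new order
    #     first_example = next(iter(vectorized_datasets["train"]))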
    # Initialize Trainer
    trainer = Seq2SeqTrainer(
        model=model,
        args=training_args,
        train_dataset=vectorized_datasets["train"] if training_args.do_train else None,
        eval_dataset=vectorized_datasets["eval"] if training_args.do_eval else None,
        tokenizer=feature_extractor,
        data_collator=data_collator,
        compute_metrics=compute_metrics if training_args.predict_with_generate else None,
        callbacks=[ShuffleCallback()],
    )

    # 12. Training
    if training_args.do_train:
        checkpoint = None
        if training_args.resume_from_checkpoint is not None:
            checkpoint = training_args.resume_from_checkpoint
        elif last_checkpoint is not None:
            checkpoint = last_checkpoint
        train_result = trainer.train(resume_from_checkpoint=checkpoint)
        trainer.save_model()  # Saves the feature extractor too for easy upload

        metrics = train_result.metrics
        if data_args.max_train_samples:
            metrics["train_samples"] = data_args.max_train_samples
        trainer.log_metrics("train", metrics)
        trainer.save_metrics("train", metrics)
        trainer.save_state()

    # 13. Evaluation
    results = {}
    if training_args.do_eval:
        logger.info("*** Evaluate ***")
        metrics = trainer.evaluate(
            metric_key_prefix="eval",
            max_length=training_args.generation_max_length,
            num_beams=training_args.generation_num_beams,
        )
        if data_args.max_eval_samples:
            metrics["eval_samples"] = data_args.max_eval_samples

        trainer.log_metrics("eval", metrics)
        trainer.save_metrics("eval", metrics)

    # 14. Write Training Stats
    kwargs = {
        "finetuned_from": model_args.model_name_or_path,
        "tasks": "automatic-speech-recognition",
        "tags": "whisper-event",
    }
    if data_args.dataset_name is not None:
        kwargs["dataset_tags"] = data_args.dataset_name
        if data_args.dataset_config_name is not None:
            kwargs["dataset"] = f"{data_args.dataset_name} {data_args.dataset_config_name}"
        else:
            kwargs["dataset"] = data_args.dataset_name
        if "common_voice" in data_args.dataset_name:
            kwargs["language"] = data_args.dataset_config_name
        if model_args.model_index_name is not None:
            kwargs["model_name"] = model_args.model_index_name

    if training_args.push_to_hub:
        trainer.push_to_hub(**kwargs)
    else:
        trainer.create_model_card(**kwargs)

    return results


if __name__ == "__main__":
    main()
    	
        runs/Dec13_21-34-37_129-146-107-47/1670967296.8737977/events.out.tfevents.1670967296.129-146-107-47.73247.1
    ADDED
    
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:5160189202b52349fbac20f9ac1c1bda0acd2794459c2af2f9ad1779922d53af
size 5870
    	
        runs/Dec13_21-34-37_129-146-107-47/events.out.tfevents.1670967296.129-146-107-47.73247.0
    ADDED
    
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:8d886b006de4e8fa0badc80bad78a1ebfbda5dda3d7207c26a844bec32877430
size 4270
    	
        runs/Dec13_21-37-24_129-146-107-47/1670967464.0219538/events.out.tfevents.1670967464.129-146-107-47.73685.1
    ADDED
    
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:fe83f4d104a2aa6cff38ec4b86e0a15e1de74769cb1eb8fb7fb09c0577791437
size 5870
    	
        runs/Dec13_21-37-24_129-146-107-47/events.out.tfevents.1670967464.129-146-107-47.73685.0
    ADDED
    
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:2b539ae9b19b47c25c56516612de0677cca124d183bd364ed80f33facf5de9ab
size 10853
    	
        setup_env.sh
    ADDED
    
@@ -0,0 +1,10 @@
cd ~
sudo add-apt-repository -y ppa:jonathonf/ffmpeg-4
sudo apt update
sudo apt install -y ffmpeg
sudo apt-get install git-lfs
env_name=whisper
python3 -m venv $env_name
echo "source ~/$env_name/bin/activate" >> ~/.bashrc
cd whisper_sprint
bash
    	
        setup_jupyter.sh
    ADDED
    
@@ -0,0 +1,4 @@
pip install jupyterlab
python -m ipykernel install --user --name=whisper
tmux new -s mysession
jupyter lab --port 8888
    	
        setup_libs.sh
    ADDED
    
@@ -0,0 +1,12 @@
pip install -r requirements.txt
git lfs install
python -c "import torch; print(torch.cuda.is_available())"
git config --global credential.helper store
huggingface-cli login
huggingface-cli repo create $2
git clone https://huggingface.co/$1/$2
cd $2
cp ../**.py .
cp ../**.sh .
cp ../**.ipynb .
wget https://raw.githubusercontent.com/huggingface/community-events/main/whisper-fine-tuning-event/fine-tune-whisper-non-streaming.ipynb
    	
        setup_libs_colab.sh
    ADDED
    
@@ -0,0 +1,11 @@
pip install -r requirements_colab.txt
git lfs install
python -c "import torch; print(torch.cuda.is_available())"
git config --global credential.helper store
huggingface-cli login
huggingface-cli repo create $2
git clone https://huggingface.co/$1/$2
cd $2
cp ../**.py .
cp ../**.sh .
cp ../**.ipynb .
    	
        special_tokens_map.json
    ADDED
    
@@ -0,0 +1,133 @@
{
  "additional_special_tokens": [
    "<|endoftext|>",
    "<|startoftranscript|>",
    "<|en|>",
    "<|zh|>",
    "<|de|>",
    "<|es|>",
    "<|ru|>",
    "<|ko|>",
    "<|fr|>",
    "<|ja|>",
    "<|pt|>",
    "<|tr|>",
    "<|pl|>",
    "<|ca|>",
    "<|nl|>",
    "<|ar|>",
    "<|sv|>",
    "<|it|>",
    "<|id|>",
    "<|hi|>",
    "<|fi|>",
    "<|vi|>",
    "<|iw|>",
    "<|uk|>",
    "<|el|>",
    "<|ms|>",
    "<|cs|>",
    "<|ro|>",
    "<|da|>",
    "<|hu|>",
    "<|ta|>",
    "<|no|>",
    "<|th|>",
    "<|ur|>",
    "<|hr|>",
    "<|bg|>",
    "<|lt|>",
    "<|la|>",
    "<|mi|>",
    "<|ml|>",
    "<|cy|>",
    "<|sk|>",
    "<|te|>",
    "<|fa|>",
    "<|lv|>",
    "<|bn|>",
    "<|sr|>",
    "<|az|>",
    "<|sl|>",
    "<|kn|>",
    "<|et|>",
    "<|mk|>",
    "<|br|>",
    "<|eu|>",
    "<|is|>",
    "<|hy|>",
    "<|ne|>",
    "<|mn|>",
    "<|bs|>",
    "<|kk|>",
    "<|sq|>",
    "<|sw|>",
    "<|gl|>",
    "<|mr|>",
    "<|pa|>",
    "<|si|>",
    "<|km|>",
    "<|sn|>",
    "<|yo|>",
    "<|so|>",
    "<|af|>",
    "<|oc|>",
    "<|ka|>",
    "<|be|>",
    "<|tg|>",
    "<|sd|>",
    "<|gu|>",
    "<|am|>",
    "<|yi|>",
    "<|lo|>",
    "<|uz|>",
    "<|fo|>",
    "<|ht|>",
    "<|ps|>",
    "<|tk|>",
    "<|nn|>",
    "<|mt|>",
    "<|sa|>",
    "<|lb|>",
    "<|my|>",
    "<|bo|>",
    "<|tl|>",
    "<|mg|>",
    "<|as|>",
    "<|tt|>",
    "<|haw|>",
    "<|ln|>",
    "<|ha|>",
    "<|ba|>",
    "<|jw|>",
    "<|su|>",
    "<|translate|>",
    "<|transcribe|>",
    "<|startoflm|>",
    "<|startofprev|>",
    "<|nocaptions|>",
    "<|notimestamps|>"
  ],
  "bos_token": {
    "content": "<|endoftext|>",
    "lstrip": false,
    "normalized": true,
    "rstrip": false,
    "single_word": false
  },
  "eos_token": {
    "content": "<|endoftext|>",
    "lstrip": false,
    "normalized": true,
    "rstrip": false,
    "single_word": false
  },
  "pad_token": "<|endoftext|>",
  "unk_token": {
    "content": "",
    "lstrip": false,
    "normalized": true,
    "rstrip": false,
    "single_word": false
  }
}
    	
        split_mgb2_test.py
    ADDED
    
@@ -0,0 +1,20 @@
import soundfile as sf
import os

os.makedirs("dataset", exist_ok=True)
archive_path = "test"
wav_dir = os.path.join(archive_path, "wav")
segments_file = os.path.join(archive_path, "text.all")
with open(segments_file, "r", encoding="utf-8") as f:
    for _id, line in enumerate(f):
        segment = line.split(" ")[0]
        text = " ".join(line.split(" ")[1:])
        wav_name, _, time = segment.split("_")
        time = time.replace("seg-", "")
        start, stop = time.split(":")
        start = int(int(start) / 100 * 16_000)
        stop = int(int(stop) / 100 * 16_000)
        wav_path = os.path.join(wav_dir, wav_name + ".wav")
        sound, _ = sf.read(wav_path, start=start, stop=stop)
        sf.write(f"dataset/{segment}.wav", sound, 16_000)
        open(f"dataset/{segment}.txt", "w").write(text)
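A note on the timestamp arithmetic above: the segment names encode start/stop
offsets in centiseconds, so dividing by 100 recovers seconds and multiplying
by the 16 kHz sample rate gives a frame index. A minimal check:

    start_cs = 1250                            # e.g. "seg-1250:..." -> 12.5 s
    start_frame = int(start_cs / 100 * 16_000)
    assert start_frame == 200_000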
    	
        split_xml_mgb2.py
    ADDED
    
@@ -0,0 +1,48 @@
+from pathlib import Path
+import soundfile as sf
+import xml.etree.ElementTree as ET
+
+split = "train"  # or "dev"
+
+# set the following path to where you
+# extracted the mgb2 archive
+archive_path = Path("data/train")
+
+wav_dir = archive_path / "wav"
+segments_file = archive_path / "xml" / "utf8"
+# output directories
+output_wav_dir = archive_path / "dataset" / split / "wav"
+output_txt_dir = archive_path / "dataset" / split / "txt"
+
+# create directories for output datasets
+output_wav_dir.mkdir(parents=True, exist_ok=True)
+output_txt_dir.mkdir(parents=True, exist_ok=True)
+
+# for all xml segment files under the utf8 directory from the archive
+for s_file in segments_file.glob("*.xml"):
+    tree = ET.parse(str(s_file))
+    root = tree.getroot()
+    head = root[0]
+    segments = root[1][0]
+
+    # get the name of the wav file from the recording tag
+    for child in head:
+        if child.tag == "recording":
+            print(child.attrib)
+            file_name = child.attrib.get("filename")
+
+    # get the start and end times from each segment under the segments tag
+    # and join the text from each segment to construct the transcript
+    for segment in segments:
+        start_time = int(float(segment.attrib.get("starttime")) * 16_000)
+        end_time = int(float(segment.attrib.get("endtime")) * 16_000)
+
+        text = " ".join([x.text for x in segment])
+
+
+        # now store the metadata and the correctly sampled wav file in the
+        # correct output directories
+        wav_path = wav_dir / f"{file_name}.wav"
+        sound, _ = sf.read(wav_path, start=start_time, stop=end_time)
+        sf.write(output_wav_dir / f"{file_name}_seg{start_time}_{end_time}.wav", sound, 16_000)
+        open(output_txt_dir / f"{file_name}_seg{start_time}_{end_time}.txt", "w").write(text)
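Once the script has run, every segment wav under dataset/<split>/wav has a transcript with the same stem under dataset/<split>/txt. A minimal sketch for walking the resulting pairs (the paths are assumed to match the directories created above):

    from pathlib import Path

    import soundfile as sf

    # Assumed to match the output layout of split_xml_mgb2.py above.
    dataset_dir = Path("data/train") / "dataset" / "train"

    for wav_file in sorted((dataset_dir / "wav").glob("*.wav")):
        txt_file = dataset_dir / "txt" / f"{wav_file.stem}.txt"
        audio, sr = sf.read(wav_file)      # 16 kHz mono segment
        transcript = txt_file.read_text()  # raw MGB-2 transcript
        print(wav_file.name, sr, len(audio), transcript[:40])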
    	
        tokenizer_config.json
    ADDED
    
@@ -0,0 +1,36 @@
+{
+  "add_bos_token": false,
+  "add_prefix_space": false,
+  "bos_token": {
+    "__type": "AddedToken",
+    "content": "<|endoftext|>",
+    "lstrip": false,
+    "normalized": true,
+    "rstrip": false,
+    "single_word": false
+  },
+  "eos_token": {
+    "__type": "AddedToken",
+    "content": "<|endoftext|>",
+    "lstrip": false,
+    "normalized": true,
+    "rstrip": false,
+    "single_word": false
+  },
+  "errors": "replace",
+  "model_max_length": 1024,
+  "name_or_path": "openai/whisper-medium",
+  "pad_token": null,
+  "processor_class": "WhisperProcessor",
+  "return_attention_mask": false,
+  "special_tokens_map_file": null,
+  "tokenizer_class": "WhisperTokenizer",
+  "unk_token": {
+    "__type": "AddedToken",
+    "content": "",
+    "lstrip": false,
+    "normalized": true,
+    "rstrip": false,
+    "single_word": false
+  }
+}
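This config records the tokenizer class and the source checkpoint (openai/whisper-medium); together with vocab.json and merges.txt it is what gets read back on load. A minimal sketch of loading it from a local checkout ("." is a placeholder path, and the language/task prefixes are an assumption for this Arabic fine-tune):

    from transformers import WhisperTokenizer

    # "." stands in for any checkout containing this tokenizer_config.json.
    tokenizer = WhisperTokenizer.from_pretrained(".", language="arabic", task="transcribe")

    ids = tokenizer("test sentence").input_ids
    print(tokenizer.decode(ids, skip_special_tokens=True))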
    	
        training_args.bin
    ADDED
    
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:660602fd31ec8bd0e852fc8fc2e58c5fce5c255ed3830a543bb8adcecd582e76
+size 3579
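training_args.bin is the pickled training-arguments object that the Trainer saves next to its checkpoints; it is not readable as text, but it can be inspected in Python. A minimal sketch (the weights_only flag is only needed on newer torch versions, where it defaults to True):

    import torch

    # Unpickles the Seq2SeqTrainingArguments saved by the Trainer.
    args = torch.load("training_args.bin", weights_only=False)
    print(args.learning_rate, args.max_steps)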
    	
        vocab.json
    ADDED
    
The diff for this file is too large to render. See raw diff.

