{
  "model_name": "AutoModel",
  "hidden_size": 768,
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "intermediate_size": 3072,
  "hidden_dropout_prob": 0.1,
  "attention_probs_dropout_prob": 0.1,
  "image_size": 224,
  "image_channels": 3,
  "patch_size": 16,
  "max_position_embeddings": 512,
  "vocab_size": 30522,
  "type_vocab_size": 2,
  "audio_sample_rate": 16000,
  "audio_frame_size": 1024,
  "audio_hop_size": 512,
  "enable_vqa": true,
  "enable_caption": true,
  "enable_retrieval": true,
  "enable_asr": true,
  "enable_realtime_asr": true,
  "batch_size": 32,
  "learning_rate": 0.0001,
  "weight_decay": 0.01,
  "warmup_steps": 10000,
  "max_steps": 100000
}