# Named test-time dataset configs, keyed by a short alias ('flickr', 'reg').
# Every entry carries the same four fields: a JSONL `filename`, an
# `image_folder`, a `template_file`, and a `type` string.
# `type` is presumably resolved through the project's dataset registry --
# confirm against the code that consumes this config.
DEFAULT_TEST_DATASET = dict(
    flickr=dict(
        filename='./reactiondata/real_test.jsonl',
        image_folder='./reaction_image',
        template_file='./config/_base_/dataset/template/reaction.json',
        type='FlickrDataset'),
    # NOTE(review): this test entry points at 'train_OCR.jsonl', the same
    # file the training 'reg' config uses -- confirm this is intentional.
    reg=dict(
        filename='./reactiondata/train_OCR.jsonl',
        image_folder='./reaction_image_OCR',
        template_file='./config/_base_/dataset/template/OCR.json',
        type='REGDataset'))
# Named training dataset configs, keyed by a short alias ('flickr', 'reg').
# Every entry carries the same four fields: a JSONL `filename`, an
# `image_folder`, a `template_file`, and a `type` string.
# `type` is presumably resolved through the project's dataset registry --
# confirm against the code that consumes this config.
DEFAULT_TRAIN_DATASET = dict(
    flickr=dict(
        filename='./reactiondata/reaction_real_structed.jsonl',
        image_folder='./reaction_image',
        template_file='./config/_base_/dataset/template/reaction.json',
        type='FlickrDataset'),
    reg=dict(
        filename='./reactiondata/train_OCR.jsonl',
        image_folder='./reaction_image_OCR',
        template_file='./config/_base_/dataset/template/OCR.json',
        type='REGDataset'))
# Data-pipeline settings: collator options, generation options, and the
# train / validation / test dataset definitions.
data_args = dict(
    collator_kwargs=dict(max_length=1024, padding=True),
    compute_metric=None,
    gen_kwargs=dict(max_new_tokens=1024, num_beams=1),
    # No test split is configured here.
    test=None,
    # Training data: two dataset configs combined by an interleaving wrapper.
    train=dict(
        cfgs=[
            dict(
                filename='./reactiondata/train_OCR.jsonl',
                image_folder='./reaction_image_OCR',
                template_file='./config/_base_/dataset/template/OCR.json',
                type='REGDataset'),
            dict(
                filename='./reactiondata/reaction_real_structed.jsonl',
                image_folder='./reaction_image',
                template_file='./config/_base_/dataset/template/reaction.json',
                type='FlickrDataset'),
        ],
        # One weight per entry in `cfgs`, in the same order.
        # NOTE(review): the first (OCR / REGDataset) entry has weight 0.0, so
        # presumably only the second dataset is ever sampled -- confirm
        # against the interleaving implementation.
        probabilities=[
            0.0,
            1,
        ],
        seed=None,
        stopping_strategy='first_exhausted',
        # NOTE(review): the spelling 'InterleaveDateset' is a runtime lookup
        # string and is kept verbatim; presumably it matches the registered
        # class name -- verify before "correcting" it.
        type='InterleaveDateset'),
    # Validation data: a single dataset config inside a concat wrapper.
    validation=dict(
        cfgs=[
            dict(
                filename='./reactiondata/real_test.jsonl',
                image_folder='./reaction_image',
                template_file='./config/_base_/dataset/template/reaction.json',
                type='FlickrDataset'),
        ],
        type='ConcatDatasetWithShuffle'))
# Model construction settings: checkpoint location, conversation template,
# multimodal (mm_*) options, and the per-modality processing functions.
model_args = dict(
    cache_dir=None,
    conv_args=dict(
        conv_template='vicuna_v1.1',
        # Same value as `model_max_length` below.
        tokenize_kwargs=dict(truncation_size=2048)),
    freeze_backbone=False,
    freeze_mm_mlp_adapter=False,
    gen_kwargs_set_bos_token_id=True,
    gen_kwargs_set_eos_token_id=True,
    gen_kwargs_set_pad_token_id=True,
    # NOTE(review): presumably the number of image tokens produced by the
    # vision tower per image -- confirm against the model implementation.
    image_token_len=300,
    mm_use_im_start_end=True,
    mm_vision_select_layer=-2,
    model_max_length=2048,
    # Local checkpoint directory; note it differs from
    # `training_args['output_dir']`, so the loaded weights are not overwritten
    # by this run's outputs.
    model_name_or_path='./exp/reaction_4.2.1',
    pretrain_mm_mlp_adapter=None,
    # One processor config per stage; each `type` is presumably a registry
    # name -- confirm against the project's process-function builder.
    process_func_args=dict(
        conv=dict(type='ShikraConvProcess'),
        image=dict(type='ShikraImageProcessor'),
        target=dict(type='BoxFormatProcess'),
        text=dict(type='ShikraTextProcess')),
    sep_image_conv_front=False,
    target_processor=dict(boxes=dict(type='PlainBoxFormatter')),
    tune_mm_mlp_adapter=False,
    type='shikra',
    version='v1',
    vision_tower='SenseTime/deformable-detr')
# Trainer settings for this run (train-only: do_train is True while do_eval
# and do_predict are False).
# NOTE(review): the key names look like Hugging Face TrainingArguments /
# Seq2SeqTrainingArguments fields -- confirm against the consumer of this
# dict before relying on library-documented semantics.
training_args = dict(
    bf16=True,
    dataloader_num_workers=4,
    do_eval=False,
    do_predict=False,
    do_train=True,
    evaluation_strategy='no',
    fsdp='full_shard auto_wrap',
    fsdp_transformer_layer_cls_to_wrap='LlamaDecoderLayer',
    gradient_accumulation_steps=1,
    gradient_checkpointing=True,
    learning_rate=2e-05,
    logging_steps=10,
    lr_scheduler_type='cosine',
    num_train_epochs=50,
    output_dir='./exp/reaction_4.2.2-large',
    overwrite_output_dir=False,
    per_device_eval_batch_size=4,
    per_device_train_batch_size=4,
    predict_with_generate=True,
    remove_unused_columns=False,
    report_to='none',
    # Step-based checkpointing; with save_total_limit=1 presumably only the
    # most recent checkpoint is retained -- confirm with the trainer in use.
    save_steps=10000,
    save_strategy='steps',
    save_total_limit=1,
    seed=42,
    tf32=True,
    warmup_ratio=0.03,
    weight_decay=0.05)