# Module-level constants: special-token string literals, per-channel
# mean/std triples, and the aggregate list of all special tokens.
#
# NOTE(review): every token literal below is the empty string in this view,
# which makes all twelve entries of SPECIAL_TOKEN_LIST identical. Tag-like
# literals may have been stripped before this file reached review -- confirm
# the values against the canonical copy before relying on them.

# Image-related tokens.
IMG_CONTEXT_TOKEN = ''
IMG_START_TOKEN = ''
IMG_END_TOKEN = ''
IMG_LINE_BREAK_TOKEN = ''
IMG_FRAME_BREAK_TOKEN = ''

# Paired start/end tokens for quad, ref and box spans.
QUAD_START_TOKEN = ''
QUAD_END_TOKEN = ''
REF_START_TOKEN = ''
REF_END_TOKEN = ''
BOX_START_TOKEN = ''
BOX_END_TOKEN = ''

IMG_UNCOND_TOKEN = ''

# Three-component mean/std pairs, one pair per named preset.
# Presumably per-channel image normalization statistics for inputs scaled
# to [0, 1] -- TODO confirm channel order and scaling with the consumer.
IMAGENET_MEAN = (0.485, 0.456, 0.406)
IMAGENET_STD = (0.229, 0.224, 0.225)
CLIP_MEAN = (0.4814546, 0.4578275, 0.40821073)
CLIP_STD = (0.2686295, 0.2613025, 0.2757711)
SIGLIP_MEAN = (0.5, 0.5, 0.5)
SIGLIP_STD = (0.5, 0.5, 0.5)
VAE_MEAN = (0.5, 0.5, 0.5)
VAE_STD = (0.5, 0.5, 0.5)

# All special tokens gathered in one list. The element order is kept exactly
# as originally written, in case a consumer depends on it.
SPECIAL_TOKEN_LIST = [
    BOX_END_TOKEN,
    BOX_START_TOKEN,
    IMG_CONTEXT_TOKEN,
    IMG_END_TOKEN,
    IMG_START_TOKEN,
    QUAD_END_TOKEN,
    QUAD_START_TOKEN,
    REF_END_TOKEN,
    REF_START_TOKEN,
    IMG_UNCOND_TOKEN,
    IMG_LINE_BREAK_TOKEN,
    IMG_FRAME_BREAK_TOKEN,
]