Spaces:
Runtime error
Runtime error
| import webdataset as wds | |
| import os | |
| from tqdm import tqdm | |
| from PIL import Image | |
| from io import BytesIO | |
| import base64 | |
# Directory that receives the generated WebDataset shard files.
OUT_DIR = "/gpfs/u/home/LMCG/LMCGljnn/scratch-shared/junyan/raw/vqav2_train_wds"
# Value handed to tqdm as the progress-bar total (expected number of rows).
TOTAL = 1828467
# Tab-separated input file; each line describes one sample.
TSV_PATH = "/gpfs/u/home/LMCG/LMCGljnn/scratch-shared/junyan/raw/vqav2_ofa/vqa_train.tsv"


def parse_vqa_row(line):
    """Split one TSV line into the pieces needed to build a shard sample.

    The line is stripped of trailing whitespace and split on tabs. Fields
    0, 1 and 6 are combined into the sample key, field 2 is the question,
    field 3 holds the answer (only the text after the last ``"|!+"``
    separator is kept) and field 5 is the base64-encoded image payload.
    Field 4 is not used.

    Args:
        line: One raw line read from the TSV file.

    Returns:
        A ``(key, caption, image_b64)`` tuple of strings, where ``key`` is
        ``"vqav2_<field0>_<field1>_<field6>"`` and ``caption`` is
        ``"Question: <question> Answer: <answer>"``.

    Raises:
        IndexError: If the line has fewer than seven tab-separated fields.
    """
    fields = line.rstrip().split("\t")
    id1 = fields[0]
    id2 = fields[1]
    question = fields[2]
    # Keep only what follows the last "|!+" marker in the answer field.
    answer = fields[3].split("|!+")[-1]
    image_b64 = fields[5]
    id3 = fields[6]
    key = f"vqav2_{id1}_{id2}_{id3}"
    caption = f"Question: {question.strip()} Answer: {answer.strip()}"
    return key, caption, image_b64


def main():
    """Convert the TSV at ``TSV_PATH`` into WebDataset shards under ``OUT_DIR``.

    Each input line becomes one sample with a ``__key__``, an RGB image
    stored under ``"jpg"`` and a question/answer caption under ``"txt"``.
    Shards are named ``%06d.tar`` and hold at most 10000 samples each.
    """
    # Make sure the output directory exists before any shard is opened.
    os.makedirs(OUT_DIR, exist_ok=True)
    shard_pattern = os.path.join(OUT_DIR, "%06d.tar")
    with wds.ShardWriter(shard_pattern, maxcount=10000) as sink:
        sink.verbose = False
        # Context manager guarantees the input file is closed, even when a
        # malformed row or an undecodable image aborts the conversion.
        with open(TSV_PATH) as tsv_file:
            for line in tqdm(tsv_file, total=TOTAL):
                key, caption, image_b64 = parse_vqa_row(line)
                raw_bytes = base64.urlsafe_b64decode(image_b64)
                image = Image.open(BytesIO(raw_bytes)).convert("RGB")
                sink.write({"__key__": key, "jpg": image, "txt": caption})


if __name__ == "__main__":
    main()