% Landing-page text retained from the original export. BibTeX skips everything
% outside an entry, so these lines have no effect on the bibliography:
%
%   Dataset Groups Activity Stream
%   CC3M, SBU Captions, Visual Genome, and COCO
%   The dataset used in the paper is a combination of CC3M, SBU Captions,
%   Visual Genome, and COCO.
%   BibTeX:
%
% Record for the combined pretraining dataset (CC3M + SBU Captions +
% Visual Genome + COCO) as published under the DOI below.
%
% Maintenance notes:
%   - The citation key is the one generated by the export and is kept as-is
%     so that existing cite commands keep resolving.
%   - The "dataset" entry type is a biblatex type; classic BibTeX styles are
%     expected to fall back to their misc handling -- confirm against the
%     style actually in use.
%   - Names are stored in "Last, First" form so parsing does not depend on
%     word order or capitalisation.
%   - Acronyms and proper nouns in the title are brace-protected so that
%     sentence-casing styles do not lowercase them.
%   - month uses the predefined macro (unbraced) so styles can expand and
%     localise it.
%   - keywords is a plain comma-separated list; the export had wrapped each
%     keyword in literal single quotes under a non-standard field name.
%   - The export also carried institution = "No Organization", which is a
%     placeholder rather than data; it is omitted here so it can never be
%     printed in a reference list.
@dataset{Jiho_Jang_and_Chaerin_Kong_and_Donghyeon_Jeon_and_Seonhoon_Kim_and_Nojun_Kwak_2024,
  author    = {Jang, Jiho and Kong, Chaerin and Jeon, Donghyeon and Kim, Seonhoon and Kwak, Nojun},
  title     = {{CC3M}, {SBU Captions}, {Visual Genome}, and {COCO}},
  publisher = {TIB},
  year      = {2024},
  month     = dec,
  doi       = {10.57702/n2m476ae},
  url       = {https://service.tib.eu/ldmservice/dataset/cc3m--sbu-captions--visual-genome--and-coco},
  keywords  = {image-text pairs, pretraining, vision-language learning},
  abstract  = {The dataset used in the paper is a combination of CC3M, SBU Captions, Visual Genome, and COCO.},
}