% Scraped page header, kept for provenance. Text outside an @entry is ignored
% by BibTeX/Biber, so these lines do not affect the bibliography.
%
%   Dataset Groups Activity Stream Multimodal C4 (mmc4)
%   Multimodal C4 (mmc4) is a public, billion-scale corpus of images and text,
%   constructed from public webpages contained in the cleaned English c4 corpus.
%   BibTex:
%
% Notes on the entry below:
%   - The citation key is unchanged so existing \cite commands keep resolving.
%   - month uses the bare `dec` macro rather than the braced string {dec}.
%   - `keyword` with Python-style quoted items was replaced by the standard
%     `keywords` field holding a plain comma-separated list.
%   - The export placeholder institution = {No Organization} was dropped; it
%     carried no bibliographic information.
%   - {C4} is braced so sentence-casing styles do not lowercase the acronym.
@dataset{Wanrong_Zhu_and_Jack_Hessel_and_Anas_Awadalla_and_Samir_Yitzhak_Gadre_and_Jesse_Dodge_and_Alex_Fang_and_Youngjae_Yu_and_Ludwig_Schmidt_and_William_Yang_Wang_and_Yejin_Choi_2024,
  author    = {Zhu, Wanrong and Hessel, Jack and Awadalla, Anas and Gadre, Samir Yitzhak and Dodge, Jesse and Fang, Alex and Yu, Youngjae and Schmidt, Ludwig and Wang, William Yang and Choi, Yejin},
  title     = {Multimodal {C4} (mmc4)},
  publisher = {TIB},
  year      = {2024},
  month     = dec,
  doi       = {10.57702/7wpd3r0e},
  url       = {https://service.tib.eu/ldmservice/dataset/multimodal-c4--mmc4-},
  keywords  = {image-text correlation, large-scale dataset, multimodal learning},
  abstract  = {Multimodal C4 (mmc4) is a public, billion-scale corpus of images and text, constructed from public webpages contained in the cleaned English c4 corpus.},
}