Dataset Groups Activity Stream SNLI-VE
The dataset used in the paper is a set of sequential vision-and-language tasks, where each task consists of an image and a text input.
BibTex:

% Dataset record for SNLI-VE, published by TIB (see the doi and url fields).
% The citation key is kept exactly as exported so existing citations still resolve.
% Cleanup applied to the exported record:
%   - month uses the unquoted dec macro so styles can expand and localize it
%   - keywords is a plain comma-separated list (exporter quote marks removed)
%   - the acronym in title is brace-protected against sentence-casing styles
%   - author names use the unambiguous "Last, First" form
%   - the exporter placeholder institution "No Organization" is dropped
@dataset{Hangyu_Guo_and_Kun_Zhou_and_Wayne_Xin_Zhao_and_Qinyu_Zhang_and_Ji-Rong_Wen_2024,
  author    = {Guo, Hangyu and Zhou, Kun and Zhao, Wayne Xin and Zhang, Qinyu and Wen, Ji-Rong},
  title     = {{SNLI-VE}},
  publisher = {TIB},
  year      = {2024},
  month     = dec,
  doi       = {10.57702/sqbvn818},
  url       = {https://service.tib.eu/ldmservice/dataset/snli-ve},
  keywords  = {Cross-modal Reasoning, multimodal input, sequential learning, vision-and-language tasks},
  abstract  = {The dataset used in the paper is a set of sequential vision-and-language tasks, where each task consists of an image and a text input.},
}