Dataset Groups Activity Stream
NLVR2
The dataset used in the paper is a set of sequential vision-and-language tasks, where each task consists of an image and a text input.

BibTex:

% Citation record for the NLVR2 dataset as published through the TIB data
% service (see the url and doi fields below).
%
% Cleanup applied to the exported record:
%   - month uses the predefined nov macro instead of a braced literal
%   - the keyword field is renamed to keywords, and the Python-style quotes
%     around each keyword are removed
%   - the placeholder institution "No Organization" is dropped so that it is
%     never printed as if it were a real institution
%   - the title acronym is brace-protected against style-driven lowercasing
%   - author names use the unambiguous "Last, First" form
%
% NOTE(review): the citation key is kept exactly as exported so that existing
% citations continue to resolve.
% NOTE(review): the author list is reproduced as exported; verify it against
% the original publication before relying on it.
% NOTE(review): the dataset entry type is defined by biblatex; classic BibTeX
% styles presumably fall back to their misc handling -- confirm for the
% toolchain in use.
@dataset{Alane_Suhr_and_Stephanie_Zhou_and_Ally_Zhang_and_Iris_Zhang_and_Hua-jun_Bai_2024,
  author    = {Suhr, Alane and Zhou, Stephanie and Zhang, Ally and Zhang, Iris and Bai, Hua-jun},
  title     = {{NLVR2}},
  publisher = {TIB},
  year      = {2024},
  month     = nov,
  doi       = {10.57702/jlpfsu64},
  url       = {https://service.tib.eu/ldmservice/dataset/nlvr2},
  keywords  = {Image, Text, Visual Question Answering, image-text pairs, natural language processing, visual reasoning},
  abstract  = {The dataset used in the paper is a set of sequential vision-and-language tasks, where each task consists of an image and a text input.},
}