Dataset Groups Activity Stream COCOQA The dataset used in the paper is a set of sequential vision-and-language tasks, where each task consists of an image and a text input. BibTex:

@comment{
  Citation record for the COCOQA dataset.
  The citation key is kept unchanged so existing cite commands keep resolving.
  Cleanup applied to the original one-line record:
    - author names written in "Last, First" form (same people, same order);
    - month given as the bare dec macro rather than a braced string;
    - keyword list moved to the standard keywords field, stray quote
      characters removed;
    - the all-caps title is brace-protected so sentence-casing styles do not
      lowercase it;
    - the placeholder institution value "No Organization" was dropped.
}

@dataset{Yuliang_Cai_and_Jesse_Thomason_and_Mohammad_Rostami_2024,
  author    = {Cai, Yuliang and Thomason, Jesse and Rostami, Mohammad},
  title     = {{COCOQA}},
  publisher = {TIB},
  year      = {2024},
  month     = dec,
  doi       = {10.57702/7y17vs75},
  url       = {https://service.tib.eu/ldmservice/dataset/cocoqa},
  keywords  = {image-text pairs, question answering},
  abstract  = {The dataset used in the paper is a set of sequential vision-and-language tasks, where each task consists of an image and a text input.},
}