Dataset Groups Activity Stream MSRVTT-QA Video question answering (VideoQA) requires systems to understand the visual information and infer an answer for a natural language question from it. BibTex:

% Citation record for the MSRVTT-QA dataset (publisher field: TIB).
% The citation key is left unchanged so existing citations keep resolving.
% Names are in "Last, First" form, month uses the bare three-letter macro,
% and the title acronym is brace-protected against style recasing.
% NOTE(review): institution looks like an export placeholder; confirm or drop.
@dataset{Zenan_Xu_and_Wanjun_Zhong_and_Qinliang_Su_and_Zijing_Ou_and_Fuwei_Zhang_2024,
  author      = {Xu, Zenan and Zhong, Wanjun and Su, Qinliang and Ou, Zijing and Zhang, Fuwei},
  title       = {{MSRVTT-QA}},
  year        = {2024},
  month       = dec,
  publisher   = {TIB},
  institution = {No Organization},
  doi         = {10.57702/1gghtev5},
  url         = {https://service.tib.eu/ldmservice/dataset/msrvtt-qa},
  keywords    = {Multimodal Learning, Question Answering, Video Question Answering, VideoQA, msrvtt, multimodal learning, natural language question, video question answering, video-language, visual information},
  abstract    = {Video question answering (VideoQA) requires systems to understand the visual information and infer an answer for a natural language question from it.},
}