% Dataset Groups Activity Stream
% Audio-Visual Question Answering
% Audio-visual question answering (AVQA) requires reference to video content
% and auditory information, followed by correlating the question to predict
% the most precise answer.
%
% BibTeX record for the TIB dataset entry. Text outside an @entry is ignored
% by BibTeX, so these lines act as comments.
% NOTE(review): the exporter's placeholder `institution = {No Organization}`
% carried no information and is omitted; add the real institution if known.
@dataset{Qilang_Ye_and_Zitong_Yu_and_Xin_Liu_2024,
  author    = {Ye, Qilang and Yu, Zitong and Liu, Xin},
  title     = {Audio-Visual Question Answering},
  publisher = {TIB},
  year      = {2024},
  month     = dec,
  doi       = {10.57702/266xhsry},
  url       = {https://service.tib.eu/ldmservice/dataset/audio-visual-question-answering},
  keywords  = {audio-visual, multimodal learning, question answering},
  abstract  = {Audio-visual question answering (AVQA) requires reference to video content and auditory information, followed by correlating the question to predict the most precise answer.},
}