Dataset Groups Activity Stream VisSpeech The dataset used for the audio-visual speech recognition task, which consists of instructional videos with semantically related visual content. BibTex:

@comment{
  Dataset record for VisSpeech (publisher field: TIB, DOI 10.57702/ct0blch5).
  The citation key is kept exactly as it was so existing citations still resolve.
  NOTE(review): institution = No Organization looks like an export placeholder
  rather than a real affiliation; confirm, then replace or drop it.
}

@dataset{Puyuan_Peng_and_Brian_Yan_and_Shinji_Watanabe_and_David_Harwath_2025,
  author      = {Peng, Puyuan and Yan, Brian and Watanabe, Shinji and Harwath, David},
  title       = {{VisSpeech}},
  year        = {2025},
  month       = jan,
  publisher   = {TIB},
  institution = {No Organization},
  doi         = {10.57702/ct0blch5},
  url         = {https://service.tib.eu/ldmservice/dataset/visspeech},
  keywords    = {CLIP model, object vocabulary, visually-conditioned prompt},
  abstract    = {The dataset used for the audio-visual speech recognition task, which consists of instructional videos with semantically related visual content.},
}