@comment{
  Page text captured together with this record (not part of the entry):
  Dataset Groups Activity Stream
  InternVid: A Large-Scale Video-Text Dataset for Multimodal Understanding and Generation
  InternVid: A large-scale video-text dataset for multimodal understanding and generation.
  BibTex:
}

@dataset{Conghui_He_and_Wei_Li_and_Zhenjiang_Jin_and_Chao_Xu_and_Bin_Wang_and_Dahua_Lin_2024,
  author      = {He, Conghui and Li, Wei and Jin, Zhenjiang and Xu, Chao and Wang, Bin and Lin, Dahua},
  title       = {{InternVid}: A Large-Scale Video-Text Dataset for Multimodal Understanding and Generation},
  year        = {2024},
  month       = dec,
  publisher   = {TIB},
  institution = {No Organization},
  doi         = {10.57702/4jomly5t},
  url         = {https://service.tib.eu/ldmservice/dataset/internvid--a-large-scale-video-text-dataset-for-multimodal-understanding-and-generation},
  keywords    = {Large Scale, Multimodal Learning, Video Analysis},
  abstract    = {InternVid: A large-scale video-text dataset for multimodal understanding and generation.},
}