Dataset Groups Activity Stream CogVideo

CogVideo is a large-scale pretrained transformer for text-to-video generation. It is trained on a dataset of 5.4 million captioned videos with a spatial resolution of 160×160.

BibTex:

% Bibliography record for the CogVideo dataset page (see the url field).
% Text outside an entry is ignored by the parser, so these notes are safe here.
%   - The citation key is kept exactly as exported so existing citations still resolve.
%   - month uses the predefined bare macro so the style decides how it is printed.
%   - The title word is brace-protected so sentence-casing styles keep its inner capital.
%   - keywords is a plain comma-separated list (the export had a quoted list literal
%     under the non-standard field name "keyword").
%   - The exporter's placeholder institution value (No Organization) is omitted so it
%     cannot be printed as if it were a real institution.
@dataset{Wenyi_Hong_and_Ming_Ding_and_Wendi_Zheng_and_Xinghan_Liu_and_Jie_Tang_2024,
  author    = {Hong, Wenyi and Ding, Ming and Zheng, Wendi and Liu, Xinghan and Tang, Jie},
  title     = {{CogVideo}},
  publisher = {TIB},
  year      = {2024},
  month     = dec,
  doi       = {10.57702/elttxmzw},
  url       = {https://service.tib.eu/ldmservice/dataset/cogvideo},
  keywords  = {text-to-video, transformer, video generation},
  abstract  = {CogVideo is a large-scale pretrained transformer for text-to-video generation. It is trained on a dataset of 5.4 million captioned videos with a spatial resolution of 160×160.},
}