% Text captured from the dataset landing page (BibTeX ignores everything
% outside an entry, so these lines act as comments):
%
%   Dataset Groups Activity Stream
%   Multiscale Vision Transformers
%   Multiscale Vision Transformers (MViT) for video and image recognition, by
%   connecting the seminal idea of multiscale feature hierarchies with
%   transformer models.
%   BibTex:
%
% Review notes on the entry below:
%   * The citation key is kept exactly as exported so existing citations
%     continue to resolve.
%   * Authors are written in "Last, First" form so name parsing is unambiguous.
%   * month uses the predefined macro (bare dec), not a braced string.
%   * keywords is a plain comma-separated list; the exporter's single-quote
%     wrappers were removed.
%   * The exporter's placeholder institution value ("No Organization") was
%     dropped: it carries no information and is not a dataset-type field.

@dataset{Haoqi_Fan_and_Bo_Xiong_and_Karttikeya_Mangalam_and_Yanghao_Li_and_Zhicheng_Yan_and_Jitendra_Malik_and_Christoph_Feichtenhofer_2024,
  author    = {Fan, Haoqi and Xiong, Bo and Mangalam, Karttikeya and Li, Yanghao and Yan, Zhicheng and Malik, Jitendra and Feichtenhofer, Christoph},
  title     = {Multiscale Vision Transformers},
  publisher = {TIB},
  year      = {2024},
  month     = dec,
  doi       = {10.57702/opc3c3ie},
  url       = {https://service.tib.eu/ldmservice/dataset/multiscale-vision-transformers},
  keywords  = {Image Recognition, Multiscale Vision Transformers, Video Recognition},
  abstract  = {Multiscale Vision Transformers (MViT) for video and image recognition, by connecting the seminal idea of multiscale feature hierarchies with transformer models.},
}