Places

The dataset used in the paper is Places, a large dataset of 400k pairs of images from the Places 205 dataset and corresponding spoken audio captions.

BibTeX:

@dataset{Bolei_Zhou_and_Aditya_Khosla_and_Agata_Lapedriza_and_Aude_Oliva_and_Antonio_Torralba_2024,
  abstract    = {The dataset used in the paper is Places, a large dataset of 400k pairs of images from the Places 205 dataset and corresponding spoken audio captions.},
  author      = {Bolei Zhou and Aditya Khosla and Agata Lapedriza and Aude Oliva and Antonio Torralba},
  doi         = {10.57702/dwtciyo9},
  institution = {No Organization},
  keyword     = {Computer Vision, Image Classification, Image Recognition, Large Scale, Places, Scene Recognition, Scene Understanding, categories, cross-modal learning, image classification, image-audio, image-audio retrieval},
  month       = {dec},
  publisher   = {TIB},
  title       = {Places},
  url         = {https://service.tib.eu/ldmservice/dataset/places},
  year        = {2024}
}