Dataset Groups Activity Stream SpeechCLIP SpeechCLIP is a novel framework to integrate speech SSL models with a pre-trained vision and language model. BibTex:

% Citation record for the SpeechCLIP dataset entry.
% - The citation key is kept exactly as exported so existing citations still resolve.
% - The title is brace-protected so sentence-casing styles do not lowercase it.
% - month uses the predefined three-letter macro (unbraced).
% - keywords is a plain comma-separated list (no quote characters).
% - institution holds the exporter's placeholder value; NOTE(review): confirm
%   whether a real organization should replace it.
@dataset{Yi-Jen_Shih_and_Hsuan-Fu_Wang_and_Heng-Jui_Chang_and_Layne_Berry_and_Hung-yi_Lee_and_David_Harwath_2024,
  author      = {Shih, Yi-Jen and Wang, Hsuan-Fu and Chang, Heng-Jui and Berry, Layne and Lee, Hung-yi and Harwath, David},
  title       = {{SpeechCLIP}},
  year        = {2024},
  month       = dec,
  publisher   = {TIB},
  institution = {No Organization},
  doi         = {10.57702/dncqq38p},
  url         = {https://service.tib.eu/ldmservice/dataset/speechclip},
  keywords    = {image-speech retrieval, semantically related keywords, speech-text retrieval, zero-shot speech-text retrieval},
  abstract    = {SpeechCLIP is a novel framework to integrate speech SSL models with a pre-trained vision and language model.},
}