Dataset Groups Activity Stream ClipCap: CLIP Prefix for Image Captioning Image captioning is a fundamental task in vision-language understanding, where the model predicts a textual informative caption to a given input image. BibTex:

@comment{
  Dataset record for "ClipCap: CLIP Prefix for Image Captioning" (publisher: TIB).
  The citation key is kept exactly as exported so existing citations keep resolving.
  The url and doi values are reproduced verbatim; do not "correct" the url slug.
  NOTE(review): institution holds what looks like an exporter placeholder
  ("No Organization") -- confirm whether a real organization applies.
}

@dataset{Ron_Mokady_and_Amir_Hertz_and_Amit_H_Bermano_2024,
  author      = {Mokady, Ron and Hertz, Amir and Bermano, Amit H.},
  title       = {{ClipCap}: {CLIP} Prefix for Image Captioning},
  abstract    = {Image captioning is a fundamental task in vision-language understanding, where the model predicts a textual informative caption to a given input image.},
  keywords    = {CLIP, image captioning, vision-language understanding},
  institution = {No Organization},
  publisher   = {TIB},
  year        = {2024},
  month       = dec,
  doi         = {10.57702/15ie8lak},
  url         = {https://service.tib.eu/ldmservice/dataset/clipcap--clip-pre-x-for-image-captioning},
}