Dataset Groups Activity Stream Video-LLaMA: An instruction-tuned audio-visual language model for video understanding A video-LLaMA model for video understanding, comprising 100k videos with detailed captions. BibTeX: @dataset{Hang_Zhang_and_Xin_Li_and_Lidong_Bing_2024, abstract = {A video-LLaMA model for video understanding, comprising 100k videos with detailed captions.}, author = {Hang Zhang and Xin Li and Lidong Bing}, doi = {10.57702/ztz8frfm}, institution = {No Organization}, keywords = {instruction-following, multimodal learning, video captioning, video understanding}, month = {dec}, publisher = {TIB}, title = {Video-LLaMA: An instruction-tuned audio-visual language model for video understanding}, url = {https://service.tib.eu/ldmservice/dataset/video-llama--an-instruction-tuned-audio-visual-language-model-for-video-understanding}, year = {2024} }