Dataset Groups Activity Stream Universal and transferable adversarial attacks on aligned language models AdvBench is a dataset for evaluating the safety of large language models. BibTex:

% Citation record for the AdvBench dataset as published by TIB.
% Anything outside an entry (such as the text above) is ignored by BibTeX.
% Cleanup relative to the raw export:
%   - month uses the predefined macro (dec) instead of a braced literal;
%   - the keyword list is stored in the standard "keywords" field as a plain
%     comma-separated list, without the exporter's single quotes;
%   - the placeholder institution "No Organization" was dropped because it is
%     not real bibliographic data -- NOTE(review): restore it if some
%     downstream tool relies on that field being present;
%   - author names use the unambiguous "Last, First" form.
@dataset{Andy_Zou_and_Zifan_Wang_and_J_Zico_Kolter_and_Matt_Fredrikson_2024,
  author    = {Zou, Andy and Wang, Zifan and Kolter, J. Zico and Fredrikson, Matt},
  title     = {Universal and transferable adversarial attacks on aligned language models},
  publisher = {TIB},
  year      = {2024},
  month     = dec,
  doi       = {10.57702/2oo2r02d},
  url       = {https://service.tib.eu/ldmservice/dataset/universal-and-transferable-adversarial-attacks-on-aligned-language-models},
  keywords  = {large language models, question answering, relation linking, safety, single entity, single relation},
  abstract  = {AdvBench is a dataset for evaluating the safety of large language models.},
}