Dataset Groups Activity Stream C4 The dataset used for pre-training language models, containing a large collection of text documents. BibTex:

% Bibliography record for the C4 dataset (publisher TIB; see the doi and url fields).
%
% NOTE(review): the citation key below is kept byte-for-byte as exported so that
% existing citations keep resolving. It contains garbled non-ASCII characters
% (from the mis-encoded surname "Marasovic" with an acute accent on the final c),
% which Biber accepts but classic BibTeX toolchains may reject. Rename it only
% together with every document that cites it.
%
% Changes relative to the raw export:
%   - author: repaired the mis-encoded surname and switched to "Last, First" form
%   - title: braced the acronym so sentence-casing styles do not lowercase it
%   - keywords: field name corrected from "keyword"; stray quote marks removed
%   - month: bare macro instead of a braced string
%   - dropped the exporter placeholder institution value "No Organization"
@dataset{Jesse_Dodge_and_Maarten_Sap_and_Ana_MarasoviĀ“c_and_William_Agnew_and_Gabriel_Ilharco_and_Dirk_Groeneveld_and_Margaret_Mitchell_and_Matt_Gardner_2024,
  author    = {Dodge, Jesse and Sap, Maarten and Marasovi{\'c}, Ana and Agnew, William and Ilharco, Gabriel and Groeneveld, Dirk and Mitchell, Margaret and Gardner, Matt},
  title     = {{C4}},
  abstract  = {The dataset used for pre-training language models, containing a large collection of text documents.},
  keywords  = {Language Understanding, Large-Scale Dataset, dataset, evaluation, language model, language modeling, text classification, text document, text generation},
  publisher = {TIB},
  year      = {2024},
  month     = dec,
  doi       = {10.57702/0wpldwvq},
  url       = {https://service.tib.eu/ldmservice/dataset/c4},
}