Dataset Groups Activity Stream The Pile The Pile dataset contains 3.5 million samples of diverse text for language modeling. BibTeX:

@comment{
  Dataset record for "The Pile", published by TIB (see the doi and url fields).
  The citation key is intentionally left unchanged so existing citations keep resolving.
  NOTE(review): the institution value "No Organization" looks like a placeholder
  carried over from the original record; confirm before relying on it.
}
@dataset{Jaap_Jumelet_and_Lisa_Bylinina_and_Willem_Zuidema_and_Jakub_Szymanik_2024,
  author      = {Jumelet, Jaap and Bylinina, Lisa and Zuidema, Willem and Szymanik, Jakub},
  title       = {The {Pile}},
  year        = {2024},
  month       = dec,
  publisher   = {TIB},
  institution = {No Organization},
  doi         = {10.57702/q45kb0rx},
  url         = {https://service.tib.eu/ldmservice/dataset/the-pile},
  keywords    = {Diverse Text, Language Modeling, Text Data, corpus, language modeling, language models, text, text analysis},
  abstract    = {The Pile dataset contains 3.5 million samples of diverse text for language modeling.},
}