@comment{
  Page text captured alongside the entry (not bibliographic data):
  Dataset Groups Activity Stream
  Pile
  The Pile dataset consists of 800GB text from 22 domains. Cynical selection naturally prefers text data based on the target corpus.
  BibTex:
}

@dataset{L_Gao_and_S_Biderman_and_S_Black_and_L_Golding_and_T_Hoppe_and_C_Foster_and_J_Phang_and_H_He_and_A_Thite_and_N_Nabeshima_2025,
  author      = {Gao, L. and Biderman, S. and Black, S. and Golding, L. and Hoppe, T. and Foster, C. and Phang, J. and He, H. and Thite, A. and Nabeshima, N.},
  title       = {Pile},
  publisher   = {TIB},
  institution = {No Organization},
  year        = {2025},
  month       = jan,
  doi         = {10.57702/vo9gf0k6},
  url         = {https://service.tib.eu/ldmservice/dataset/pile},
  keywords    = {Domain Adaptation, Text Data, question answering, single entity, single relation},
  abstract    = {The Pile dataset consists of 800GB text from 22 domains. Cynical selection naturally prefers text data based on the target corpus.},
}