% Dataset Groups Activity Stream
% S2ORC
% A collection of 81.1 million scholarly publications in English from various
% academic fields, used to pre-train a language model.
%
% NOTE(review): the exported record carried institution = {No Organization},
% which looks like an exporter placeholder rather than real data; it is omitted
% here so that no style prints it. Restore it if a real institution is known.
% NOTE(review): author given names are initials only in the exported record;
% replace with full given names once confirmed against the original publication.
%
% BibTeX:
@dataset{Lo_K_and_Wang_L_L_and_Neumann_M_and_Kinney_R_and_Weld_D_S_2024,
  author    = {Lo, K. and Wang, L. L. and Neumann, M. and Kinney, R. and Weld, D. S.},
  title     = {{S2ORC}},
  publisher = {TIB},
  year      = {2024},
  month     = dec,
  doi       = {10.57702/g2wuqc2w},
  url       = {https://service.tib.eu/ldmservice/dataset/s2orc},
  abstract  = {A collection of 81.1 million scholarly publications in English from various academic fields, used to pre-train a language model.},
  keywords  = {ACL Anthology, Open Research, Research Corpus, Semantic Scholar, academic articles, academic fields, corpus, full-text PDF parses, materials science, molecular biology, scholarly publications, scientific text},
}