Dataset Groups Activity Stream OpenWebText Corpus A dataset for language modeling, where the goal is to predict the next word in a sequence given the previous words. BibTex:

% Catalog record for the OpenWebText Corpus dataset (DOI 10.57702/yw8o2eqh).
% Notes for maintainers (text outside an entry is ignored by BibTeX/Biber):
%   - Authors use the "Last, First" form so the compound surname "De Sa"
%     is parsed as the family name rather than being split.
%   - month is the predefined, unbraced three-letter macro.
%   - keywords is a plain comma-separated list (no quote characters).
%   - {OpenWebText} is braced to keep its camel-case under sentence-casing styles.
@dataset{Subham_Sekhar_Sahoo_and_Aaron_Gokaslan_and_Chris_De_Sa_and_Volodymyr_Kuleshov_2024,
  author      = {Sahoo, Subham Sekhar and Gokaslan, Aaron and De Sa, Chris and Kuleshov, Volodymyr},
  title       = {{OpenWebText} Corpus},
  year        = {2024},
  month       = dec,
  publisher   = {TIB},
  institution = {No Organization},
  doi         = {10.57702/yw8o2eqh},
  url         = {https://service.tib.eu/ldmservice/dataset/openwebtext-corpus},
  keywords    = {OpenWebText, Text Classification, Web Scraping, language modeling, sequence prediction, text generation},
  abstract    = {A dataset for language modeling, where the goal is to predict the next word in a sequence given the previous words.},
}