Dataset Groups Activity Stream Billion Word Benchmark Dataset The dataset contains 768M tokens for language modeling. BibTeX:

% Citation record for the Billion Word Benchmark Dataset (publisher: TIB).
% Everything outside the entry braces, including these lines, is ignored by BibTeX.
%
% Maintainer notes:
%   - "dataset" is a biblatex entry type.
%   - author: "and others" is the token a style renders as "et al."; only the
%     surname "Hassan" is present in this record -- TODO: add the given name and
%     the remaining authors once confirmed against the DOI landing page.
%   - title: the dataset's proper name is braced so sentence-casing styles keep
%     its capitalisation.
%   - month: the bare three-letter macro is used so styles can localise it.
%   - doi: stored bare, without a resolver prefix.
%   - keywords: plain comma-separated list.
@dataset{Hassan_et_al_2024,
  author    = {Hassan and others},
  title     = {{Billion Word Benchmark} Dataset},
  year      = {2024},
  month     = dec,
  publisher = {TIB},
  doi       = {10.57702/bprj7ycm},
  url       = {https://service.tib.eu/ldmservice/dataset/billion-word-benchmark-dataset},
  keywords  = {Billion Word, Language Modeling},
  abstract  = {The dataset contains 768M tokens for language modeling.},
}