% Dataset Groups Activity Stream
% CommonCrawl
% CommonCrawl is a non-profit organization that provides a large corpus of
% web pages for research and development purposes.
%
% BibTex:
%
% Dataset record for the CommonCrawl web-page corpus, as published by TIB.
% The citation key is kept exactly as exported so existing citations resolve.
% NOTE(review): `institution = {No Organization}` looks like an export
% placeholder; it is kept verbatim here -- confirm whether it should be dropped.
@dataset{Hu_Xu_and_Saining_Xie_and_Xiaoqing_Ellen_Tan_and_Po-Yao_Huang_and_Russell_Howes_and_Vasu_Sharma_and_Shang-Wen_Li_and_Gargi_Ghosh_and_Luke_Zettlemoyer_and_Christoph_Feichtenhofer_2024,
  author      = {Xu, Hu and
                 Xie, Saining and
                 Tan, Xiaoqing Ellen and
                 Huang, Po-Yao and
                 Howes, Russell and
                 Sharma, Vasu and
                 Li, Shang-Wen and
                 Ghosh, Gargi and
                 Zettlemoyer, Luke and
                 Feichtenhofer, Christoph},
  title       = {{CommonCrawl}},
  year        = {2024},
  month       = dec,
  publisher   = {TIB},
  institution = {No Organization},
  doi         = {10.57702/ygkh0gji},
  url         = {https://service.tib.eu/ldmservice/dataset/commoncrawl},
  keywords    = {Corpora, Corpus, Data Collection, Text Data, Web Pages, commoncrawl, document classification, text data, text pre-training, web scraping},
  abstract    = {CommonCrawl is a non-profit organization that provides a large corpus of web pages for research and development purposes.},
}