@comment{
  Dataset Groups Activity Stream
  Proof-Pile-2
  The dataset used for continual pre-training of large language models, with a focus on balancing the text distribution and mitigating overfitting.
  BibTeX:

  NOTE(review): the institution value "No Organization" looks like an export
  placeholder rather than a real affiliation -- confirm before relying on it.
}

@dataset{Yunfan_Shao_and_Linyang_Li_and_Zhaoye_Fei_and_Hang_Yan_and_Dahua_Lin_and_Xipeng_Qiu_2024,
  author      = {Shao, Yunfan and Li, Linyang and Fei, Zhaoye and Yan, Hang and Lin, Dahua and Qiu, Xipeng},
  title       = {{Proof-Pile-2}},
  publisher   = {TIB},
  institution = {No Organization},
  year        = {2024},
  month       = dec,
  doi         = {10.57702/7iiqekx3},
  url         = {https://service.tib.eu/ldmservice/dataset/proof-pile-2},
  keywords    = {continual pre-training, large language models, text distribution},
  abstract    = {The dataset used for continual pre-training of large language models, with a focus on balancing the text distribution and mitigating overfitting.},
}