@comment{
  Page text that accompanied this exported entry:
    Dataset Groups Activity Stream -- arXMLiv 2018.
    The arXMLiv 2018 dataset is an HTML collection of the arXiv.org preprint
    archive, used as a training corpus for word embedding techniques.
    BibTeX:

  Review notes:
  - The citation key below is kept exactly as exported so that existing
    citations keep resolving. It contains a non-ASCII character, so it needs
    a UTF-8-aware backend (Biber); classic BibTeX toolchains may reject it.
  - The exporter placeholder institution = {No Organization} was dropped:
    it carries no information and would print verbatim under styles that
    render the institution field. Restore it with a real value if one exists.
  - The exported field keyword held a Python-style quoted list; it is now the
    standard keywords field with plain comma-separated values.
  - No urldate is given because the access date is not known from the export.
}

@dataset{André_Greiner-Petter_and_Terry_Ruas_and_Moritz_Schubotz_and_Akiko_Aizawa_and_William_Grosky_and_Bela_Gipp_2024,
  author    = {Greiner-Petter, André and Ruas, Terry and Schubotz, Moritz and Aizawa, Akiko and Grosky, William and Gipp, Bela},
  title     = {{arXMLiv} 2018},
  publisher = {TIB},
  year      = {2024},
  month     = dec,
  doi       = {10.57702/zauxa58e},
  url       = {https://service.tib.eu/ldmservice/dataset/arxmliv-2018},
  keywords  = {Formula Embedding, HTML, Information Retrieval, Mathematics, arXMLiv},
  abstract  = {The arXMLiv 2018 dataset is an HTML collection of the arXiv.org preprint archive, used as a training corpus for word embedding techniques.},
}