@comment{
  WSJ dataset record.
  The WSJ corpus is a large vocabulary continuous speech recognition dataset.
  It contains 36416 sequences, representing around 80 hours of speech.

  Review notes:
  - The citation key is left unchanged so existing citations keep resolving.
  - The exported record also carried institution = "No Organization", which
    looks like an export placeholder rather than a real affiliation; it is
    omitted from the entry below. Restore it if it turns out to be meaningful.
  - No urldate is given because the access date is not known from the record.
}

@dataset{Andros_Tjandra_and_Sakriani_Sakti_and_Satoshi_Nakamura_2024,
  author    = {Tjandra, Andros and Sakti, Sakriani and Nakamura, Satoshi},
  title     = {{WSJ}},
  year      = {2024},
  month     = dec,
  publisher = {TIB},
  doi       = {10.57702/5n00l3tl},
  url       = {https://service.tib.eu/ldmservice/dataset/wsj},
  keywords  = {ASR, Audio Data, Continuous Speech Recognition, Corpus, Encoder-Decoder, Large Vocabulary, Sequence-to-Sequence, Speech Recognition, TTS, Unlabeled Data, WSJ},
  abstract  = {The WSJ corpus is a large vocabulary continuous speech recognition dataset. It contains 36416 sequences, representing around 80 hours of speech.},
}