Changes
On December 3, 2024 at 10:00:41 AM UTC, admin:
-
Changed value of field
doi_status
to True
in FastSpeech: Fast, Robust and Controllable Text to Speech -
Changed value of field
doi_date_published
to 2024-12-03
in FastSpeech: Fast, Robust and Controllable Text to Speech -
Added resource Original Metadata to FastSpeech: Fast, Robust and Controllable Text to Speech
f | 1 | { | f | 1 | { |
2 | "access_rights": "", | 2 | "access_rights": "", | ||
3 | "author": "Yi Ren", | 3 | "author": "Yi Ren", | ||
4 | "author_email": "", | 4 | "author_email": "", | ||
5 | "citation": [], | 5 | "citation": [], | ||
6 | "creator_user_id": "17755db4-395a-4b3b-ac09-e8e3484ca700", | 6 | "creator_user_id": "17755db4-395a-4b3b-ac09-e8e3484ca700", | ||
7 | "defined_in": "", | 7 | "defined_in": "", | ||
8 | "doi": "10.57702/jd8hw0cw", | 8 | "doi": "10.57702/jd8hw0cw", | ||
n | 9 | "doi_date_published": null, | n | 9 | "doi_date_published": "2024-12-03", |
10 | "doi_publisher": "TIB", | 10 | "doi_publisher": "TIB", | ||
n | 11 | "doi_status": false, | n | 11 | "doi_status": true, |
12 | "domain": "https://service.tib.eu/ldmservice", | 12 | "domain": "https://service.tib.eu/ldmservice", | ||
13 | "extra_authors": [ | 13 | "extra_authors": [ | ||
14 | { | 14 | { | ||
15 | "extra_author": "Zhejiang University", | 15 | "extra_author": "Zhejiang University", | ||
16 | "orcid": "" | 16 | "orcid": "" | ||
17 | }, | 17 | }, | ||
18 | { | 18 | { | ||
19 | "extra_author": "Yangjun Ruan", | 19 | "extra_author": "Yangjun Ruan", | ||
20 | "orcid": "" | 20 | "orcid": "" | ||
21 | }, | 21 | }, | ||
22 | { | 22 | { | ||
23 | "extra_author": "Zhejiang University", | 23 | "extra_author": "Zhejiang University", | ||
24 | "orcid": "" | 24 | "orcid": "" | ||
25 | }, | 25 | }, | ||
26 | { | 26 | { | ||
27 | "extra_author": "Xu Tan", | 27 | "extra_author": "Xu Tan", | ||
28 | "orcid": "" | 28 | "orcid": "" | ||
29 | }, | 29 | }, | ||
30 | { | 30 | { | ||
31 | "extra_author": "Microsoft Research", | 31 | "extra_author": "Microsoft Research", | ||
32 | "orcid": "" | 32 | "orcid": "" | ||
33 | }, | 33 | }, | ||
34 | { | 34 | { | ||
35 | "extra_author": "Tao Qin", | 35 | "extra_author": "Tao Qin", | ||
36 | "orcid": "" | 36 | "orcid": "" | ||
37 | }, | 37 | }, | ||
38 | { | 38 | { | ||
39 | "extra_author": "Microsoft Research", | 39 | "extra_author": "Microsoft Research", | ||
40 | "orcid": "" | 40 | "orcid": "" | ||
41 | }, | 41 | }, | ||
42 | { | 42 | { | ||
43 | "extra_author": "Sheng Zhao", | 43 | "extra_author": "Sheng Zhao", | ||
44 | "orcid": "" | 44 | "orcid": "" | ||
45 | }, | 45 | }, | ||
46 | { | 46 | { | ||
47 | "extra_author": "Microsoft STC Asia", | 47 | "extra_author": "Microsoft STC Asia", | ||
48 | "orcid": "" | 48 | "orcid": "" | ||
49 | }, | 49 | }, | ||
50 | { | 50 | { | ||
51 | "extra_author": "Zhou Zhao", | 51 | "extra_author": "Zhou Zhao", | ||
52 | "orcid": "" | 52 | "orcid": "" | ||
53 | }, | 53 | }, | ||
54 | { | 54 | { | ||
55 | "extra_author": "Zhejiang University", | 55 | "extra_author": "Zhejiang University", | ||
56 | "orcid": "" | 56 | "orcid": "" | ||
57 | }, | 57 | }, | ||
58 | { | 58 | { | ||
59 | "extra_author": "Tie-Yan Liu", | 59 | "extra_author": "Tie-Yan Liu", | ||
60 | "orcid": "" | 60 | "orcid": "" | ||
61 | }, | 61 | }, | ||
62 | { | 62 | { | ||
63 | "extra_author": "Microsoft Research", | 63 | "extra_author": "Microsoft Research", | ||
64 | "orcid": "" | 64 | "orcid": "" | ||
65 | } | 65 | } | ||
66 | ], | 66 | ], | ||
67 | "groups": [ | 67 | "groups": [ | ||
68 | { | 68 | { | ||
69 | "description": "", | 69 | "description": "", | ||
70 | "display_name": "Text-to-Speech", | 70 | "display_name": "Text-to-Speech", | ||
71 | "id": "486fb9b3-8453-4c88-bd0d-b9d59c07299c", | 71 | "id": "486fb9b3-8453-4c88-bd0d-b9d59c07299c", | ||
72 | "image_display_url": "", | 72 | "image_display_url": "", | ||
73 | "name": "text-to-speech", | 73 | "name": "text-to-speech", | ||
74 | "title": "Text-to-Speech" | 74 | "title": "Text-to-Speech" | ||
75 | } | 75 | } | ||
76 | ], | 76 | ], | ||
77 | "id": "bb16220b-f154-4348-9b62-d42092c35995", | 77 | "id": "bb16220b-f154-4348-9b62-d42092c35995", | ||
78 | "isopen": false, | 78 | "isopen": false, | ||
79 | "landing_page": "https://speechresearch.github.io/fastspeech/", | 79 | "landing_page": "https://speechresearch.github.io/fastspeech/", | ||
80 | "license_title": null, | 80 | "license_title": null, | ||
81 | "link_orkg": "", | 81 | "link_orkg": "", | ||
82 | "metadata_created": "2024-12-03T10:00:40.083656", | 82 | "metadata_created": "2024-12-03T10:00:40.083656", | ||
n | 83 | "metadata_modified": "2024-12-03T10:00:40.083661", | n | 83 | "metadata_modified": "2024-12-03T10:00:40.537720", |
84 | "name": "fastspeech--fast--robust-and-controllable-text-to-speech", | 84 | "name": "fastspeech--fast--robust-and-controllable-text-to-speech", | ||
85 | "notes": "Neural network based end-to-end text to speech (TTS) has | 85 | "notes": "Neural network based end-to-end text to speech (TTS) has | ||
86 | signi\ufb01cantly improved the quality of synthesized speech. | 86 | signi\ufb01cantly improved the quality of synthesized speech. | ||
87 | Prominent methods (e.g., Tacotron 2) usually \ufb01rst generate | 87 | Prominent methods (e.g., Tacotron 2) usually \ufb01rst generate | ||
88 | mel-spectrogram from text, and then synthesize speech from the | 88 | mel-spectrogram from text, and then synthesize speech from the | ||
89 | mel-spectrogram using vocoder such as WaveNet. Compared with | 89 | mel-spectrogram using vocoder such as WaveNet. Compared with | ||
90 | traditional concatenative and statistical parametric approaches, | 90 | traditional concatenative and statistical parametric approaches, | ||
91 | neural network based end-to-end models suffer from slow inference | 91 | neural network based end-to-end models suffer from slow inference | ||
92 | speed, and the synthesized speech is usually not robust (i.e., some | 92 | speed, and the synthesized speech is usually not robust (i.e., some | ||
93 | words are skipped or repeated) and lack of con-trollability (voice | 93 | words are skipped or repeated) and lack of con-trollability (voice | ||
94 | speed or prosody control). In this work, we propose a novel | 94 | speed or prosody control). In this work, we propose a novel | ||
95 | feed-forward network based on Transformer to generate mel-spectrogram | 95 | feed-forward network based on Transformer to generate mel-spectrogram | ||
96 | in parallel for TTS.", | 96 | in parallel for TTS.", | ||
n | 97 | "num_resources": 0, | n | 97 | "num_resources": 1, |
98 | "num_tags": 4, | 98 | "num_tags": 4, | ||
99 | "organization": { | 99 | "organization": { | ||
100 | "approval_status": "approved", | 100 | "approval_status": "approved", | ||
101 | "created": "2024-11-25T12:11:38.292601", | 101 | "created": "2024-11-25T12:11:38.292601", | ||
102 | "description": "", | 102 | "description": "", | ||
103 | "id": "079d46db-32df-4b48-91f3-0a8bc8f69559", | 103 | "id": "079d46db-32df-4b48-91f3-0a8bc8f69559", | ||
104 | "image_url": "", | 104 | "image_url": "", | ||
105 | "is_organization": true, | 105 | "is_organization": true, | ||
106 | "name": "no-organization", | 106 | "name": "no-organization", | ||
107 | "state": "active", | 107 | "state": "active", | ||
108 | "title": "No Organization", | 108 | "title": "No Organization", | ||
109 | "type": "organization" | 109 | "type": "organization" | ||
110 | }, | 110 | }, | ||
111 | "owner_org": "079d46db-32df-4b48-91f3-0a8bc8f69559", | 111 | "owner_org": "079d46db-32df-4b48-91f3-0a8bc8f69559", | ||
112 | "private": false, | 112 | "private": false, | ||
113 | "relationships_as_object": [], | 113 | "relationships_as_object": [], | ||
114 | "relationships_as_subject": [], | 114 | "relationships_as_subject": [], | ||
t | 115 | "resources": [], | t | 115 | "resources": [ |
116 | { | ||||
117 | "cache_last_updated": null, | ||||
118 | "cache_url": null, | ||||
119 | "created": "2024-12-03T10:49:30", | ||||
120 | "data": [ | ||||
121 | "dcterms:title", | ||||
122 | "dcterms:accessRights", | ||||
123 | "dcterms:creator", | ||||
124 | "dcterms:description", | ||||
125 | "dcterms:issued", | ||||
126 | "dcterms:language", | ||||
127 | "dcterms:identifier", | ||||
128 | "dcat:theme", | ||||
129 | "dcterms:type", | ||||
130 | "dcat:keyword", | ||||
131 | "dcat:landingPage", | ||||
132 | "dcterms:hasVersion", | ||||
133 | "dcterms:format", | ||||
134 | "mls:task" | ||||
135 | ], | ||||
136 | "description": "The json representation of the dataset with its | ||||
137 | distributions based on DCAT.", | ||||
138 | "format": "JSON", | ||||
139 | "hash": "", | ||||
140 | "id": "a9177ae7-e719-48b3-88a4-5ad80d65d786", | ||||
141 | "last_modified": "2024-12-03T10:00:40.529295", | ||||
142 | "metadata_modified": "2024-12-03T10:00:40.540484", | ||||
143 | "mimetype": "application/json", | ||||
144 | "mimetype_inner": null, | ||||
145 | "name": "Original Metadata", | ||||
146 | "package_id": "bb16220b-f154-4348-9b62-d42092c35995", | ||||
147 | "position": 0, | ||||
148 | "resource_type": null, | ||||
149 | "size": 1530, | ||||
150 | "state": "active", | ||||
151 | "url": | ||||
152 | resource/a9177ae7-e719-48b3-88a4-5ad80d65d786/download/metadata.json", | ||||
153 | "url_type": "upload" | ||||
154 | } | ||||
155 | ], | ||||
116 | "services_used_list": "", | 156 | "services_used_list": "", | ||
117 | "state": "active", | 157 | "state": "active", | ||
118 | "tags": [ | 158 | "tags": [ | ||
119 | { | 159 | { | ||
120 | "display_name": "Transformer", | 160 | "display_name": "Transformer", | ||
121 | "id": "4a4cb187-5e01-464d-b8e9-4b95a87809f4", | 161 | "id": "4a4cb187-5e01-464d-b8e9-4b95a87809f4", | ||
122 | "name": "Transformer", | 162 | "name": "Transformer", | ||
123 | "state": "active", | 163 | "state": "active", | ||
124 | "vocabulary_id": null | 164 | "vocabulary_id": null | ||
125 | }, | 165 | }, | ||
126 | { | 166 | { | ||
127 | "display_name": "parallel mel-spectrogram generation", | 167 | "display_name": "parallel mel-spectrogram generation", | ||
128 | "id": "6d8a0936-e93f-43e5-85de-c605c5f0fe90", | 168 | "id": "6d8a0936-e93f-43e5-85de-c605c5f0fe90", | ||
129 | "name": "parallel mel-spectrogram generation", | 169 | "name": "parallel mel-spectrogram generation", | ||
130 | "state": "active", | 170 | "state": "active", | ||
131 | "vocabulary_id": null | 171 | "vocabulary_id": null | ||
132 | }, | 172 | }, | ||
133 | { | 173 | { | ||
134 | "display_name": "speech synthesis", | 174 | "display_name": "speech synthesis", | ||
135 | "id": "499bf7ec-2e42-4899-bdb2-55134754d900", | 175 | "id": "499bf7ec-2e42-4899-bdb2-55134754d900", | ||
136 | "name": "speech synthesis", | 176 | "name": "speech synthesis", | ||
137 | "state": "active", | 177 | "state": "active", | ||
138 | "vocabulary_id": null | 178 | "vocabulary_id": null | ||
139 | }, | 179 | }, | ||
140 | { | 180 | { | ||
141 | "display_name": "text to speech", | 181 | "display_name": "text to speech", | ||
142 | "id": "ac666b23-d181-4a43-95dd-66ac96d940e7", | 182 | "id": "ac666b23-d181-4a43-95dd-66ac96d940e7", | ||
143 | "name": "text to speech", | 183 | "name": "text to speech", | ||
144 | "state": "active", | 184 | "state": "active", | ||
145 | "vocabulary_id": null | 185 | "vocabulary_id": null | ||
146 | } | 186 | } | ||
147 | ], | 187 | ], | ||
148 | "title": "FastSpeech: Fast, Robust and Controllable Text to Speech", | 188 | "title": "FastSpeech: Fast, Robust and Controllable Text to Speech", | ||
149 | "type": "dataset", | 189 | "type": "dataset", | ||
150 | "version": "" | 190 | "version": "" | ||
151 | } | 191 | } |