Changes
On October 17, 2023 at 10:50:06 AM UTC, admin:
- Added resource "10k rows source with 75% duplicate rate" to SDM-Genomic-Dataset
f | 1 | { | f | 1 | { |
2 | "author": "Samaneh Jozashoori", | 2 | "author": "Samaneh Jozashoori", | ||
3 | "author_email": "iglesias@l3s.de", | 3 | "author_email": "iglesias@l3s.de", | ||
4 | "citations": [ | 4 | "citations": [ | ||
5 | { | 5 | { | ||
6 | "cited_in": "" | 6 | "cited_in": "" | ||
7 | } | 7 | } | ||
8 | ], | 8 | ], | ||
9 | "creator_user_id": "17755db4-395a-4b3b-ac09-e8e3484ca700", | 9 | "creator_user_id": "17755db4-395a-4b3b-ac09-e8e3484ca700", | ||
10 | "doi": "10.57702/4c9ivpgs", | 10 | "doi": "10.57702/4c9ivpgs", | ||
11 | "doi_date_published": "2023-10-17", | 11 | "doi_date_published": "2023-10-17", | ||
12 | "doi_publisher": "TIB", | 12 | "doi_publisher": "TIB", | ||
13 | "doi_status": true, | 13 | "doi_status": true, | ||
14 | "domain": "https://service.tib.eu/ldmservice", | 14 | "domain": "https://service.tib.eu/ldmservice", | ||
15 | "extra_authors": [ | 15 | "extra_authors": [ | ||
16 | { | 16 | { | ||
17 | "extra_author": "Enrique Iglesias", | 17 | "extra_author": "Enrique Iglesias", | ||
18 | "orcid": "0000-0002-8734-3123" | 18 | "orcid": "0000-0002-8734-3123" | ||
19 | }, | 19 | }, | ||
20 | { | 20 | { | ||
21 | "extra_author": "Maria-Esther Vidal", | 21 | "extra_author": "Maria-Esther Vidal", | ||
22 | "orcid": "0000-0003-1160-8727" | 22 | "orcid": "0000-0003-1160-8727" | ||
23 | } | 23 | } | ||
24 | ], | 24 | ], | ||
25 | "extras": [ | 25 | "extras": [ | ||
26 | { | 26 | { | ||
27 | "__extras": { | 27 | "__extras": { | ||
28 | "id": "24ed91ac-12e1-41ba-9f6b-c10eddce6b5e", | 28 | "id": "24ed91ac-12e1-41ba-9f6b-c10eddce6b5e", | ||
29 | "package_id": "5f2180c2-be1d-45c3-a3c3-831f400b2f81", | 29 | "package_id": "5f2180c2-be1d-45c3-a3c3-831f400b2f81", | ||
30 | "state": "active" | 30 | "state": "active" | ||
31 | }, | 31 | }, | ||
32 | "key": "", | 32 | "key": "", | ||
33 | "value": "" | 33 | "value": "" | ||
34 | } | 34 | } | ||
35 | ], | 35 | ], | ||
36 | "groups": [], | 36 | "groups": [], | ||
37 | "id": "5f2180c2-be1d-45c3-a3c3-831f400b2f81", | 37 | "id": "5f2180c2-be1d-45c3-a3c3-831f400b2f81", | ||
38 | "isopen": false, | 38 | "isopen": false, | ||
39 | "license_id": "notspecified", | 39 | "license_id": "notspecified", | ||
40 | "license_title": "License not specified", | 40 | "license_title": "License not specified", | ||
41 | "maintainer": "Enrique Iglesias", | 41 | "maintainer": "Enrique Iglesias", | ||
42 | "maintainer_email": "iglesias@l3s.de", | 42 | "maintainer_email": "iglesias@l3s.de", | ||
43 | "metadata_created": "2023-10-17T09:28:29.260955", | 43 | "metadata_created": "2023-10-17T09:28:29.260955", | ||
n | 44 | "metadata_modified": "2023-10-17T10:49:37.508429", | n | 44 | "metadata_modified": "2023-10-17T10:50:05.422930", |
45 | "name": "sdm-genomic-dataset", | 45 | "name": "sdm-genomic-dataset", | ||
46 | "notes": "This benchmark is created by randomly sampling data | 46 | "notes": "This benchmark is created by randomly sampling data | ||
47 | records from somatic mutation data collected in COSMIC | 47 | records from somatic mutation data collected in COSMIC | ||
48 | (https://cancer.sanger.ac.uk/cosmic). SDM-Genomic-Datasets include | 48 | (https://cancer.sanger.ac.uk/cosmic). SDM-Genomic-Datasets include | ||
49 | eight different logical sources of various sizes, including 10k, 100k, | 49 | eight different logical sources of various sizes, including 10k, 100k, | ||
50 | 1M, and 5M rows. For every pair of sources of the same size, they | 50 | 1M, and 5M rows. For every pair of sources of the same size, they | ||
51 | differ in the percentage of the data duplicate rate, which can be | 51 | differ in the percentage of the data duplicate rate, which can be | ||
52 | either 25% or 75%, where each duplicate value is repeated 20 times. | 52 | either 25% or 75%, where each duplicate value is repeated 20 times. | ||
53 | For example, a 10k logical source with 25% data duplicate rate has 75% | 53 | For example, a 10k logical source with 25% data duplicate rate has 75% | ||
54 | duplicate-free records (i.e., 7500 rows), and the rest of the 25% | 54 | duplicate-free records (i.e., 7500 rows), and the rest of the 25% | ||
55 | records (i.e., 2500 rows) correspond to 125 different records which | 55 | records (i.e., 2500 rows) correspond to 125 different records which | ||
56 | are duplicated 20 times; in total there are 7625 unique values. ", | 56 | are duplicated 20 times; in total there are 7625 unique values. ", | ||
n | 57 | "num_resources": 2, | n | 57 | "num_resources": 3, |
58 | "num_tags": 2, | 58 | "num_tags": 2, | ||
59 | "orcid": "0000-0003-1702-8707", | 59 | "orcid": "0000-0003-1702-8707", | ||
60 | "organization": { | 60 | "organization": { | ||
61 | "approval_status": "approved", | 61 | "approval_status": "approved", | ||
62 | "created": "2021-10-14T10:15:56.594472", | 62 | "created": "2021-10-14T10:15:56.594472", | ||
63 | "description": "# Forschungszentrum | 63 | "description": "# Forschungszentrum | ||
64 | L3S\r\n\r\nhttps://www.l3s.de/", | 64 | L3S\r\n\r\nhttps://www.l3s.de/", | ||
65 | "id": "00c8f767-81fd-41ae-9235-dd4d1d43221c", | 65 | "id": "00c8f767-81fd-41ae-9235-dd4d1d43221c", | ||
66 | "image_url": | 66 | "image_url": | ||
67 | "https://www.l3s.de/sites/default/files/L3S_Logo_NEU_small.jpg", | 67 | "https://www.l3s.de/sites/default/files/L3S_Logo_NEU_small.jpg", | ||
68 | "is_organization": true, | 68 | "is_organization": true, | ||
69 | "name": "l3s", | 69 | "name": "l3s", | ||
70 | "state": "active", | 70 | "state": "active", | ||
71 | "title": "L3S", | 71 | "title": "L3S", | ||
72 | "type": "organization" | 72 | "type": "organization" | ||
73 | }, | 73 | }, | ||
74 | "owner_org": "00c8f767-81fd-41ae-9235-dd4d1d43221c", | 74 | "owner_org": "00c8f767-81fd-41ae-9235-dd4d1d43221c", | ||
75 | "private": false, | 75 | "private": false, | ||
76 | "publications": [ | 76 | "publications": [ | ||
77 | { | 77 | { | ||
78 | "published_in": "" | 78 | "published_in": "" | ||
79 | } | 79 | } | ||
80 | ], | 80 | ], | ||
81 | "relationships_as_object": [], | 81 | "relationships_as_object": [], | ||
82 | "relationships_as_subject": [], | 82 | "relationships_as_subject": [], | ||
83 | "resources": [ | 83 | "resources": [ | ||
84 | { | 84 | { | ||
85 | "auto_update": "No", | 85 | "auto_update": "No", | ||
86 | "auto_update_last_update": "", | 86 | "auto_update_last_update": "", | ||
87 | "auto_update_url": "", | 87 | "auto_update_url": "", | ||
88 | "cache_last_updated": null, | 88 | "cache_last_updated": null, | ||
89 | "cache_url": null, | 89 | "cache_url": null, | ||
90 | "created": "2023-10-17T09:37:06.230719", | 90 | "created": "2023-10-17T09:37:06.230719", | ||
91 | "description": "", | 91 | "description": "", | ||
92 | "format": ".zip", | 92 | "format": ".zip", | ||
93 | "hash": "", | 93 | "hash": "", | ||
94 | "id": "d65a3ddc-ff78-4a5e-8863-cbecdc76d803", | 94 | "id": "d65a3ddc-ff78-4a5e-8863-cbecdc76d803", | ||
95 | "last_modified": "2023-10-17T09:37:06.200438", | 95 | "last_modified": "2023-10-17T09:37:06.200438", | ||
96 | "metadata_modified": "2023-10-17T09:37:06.220368", | 96 | "metadata_modified": "2023-10-17T09:37:06.220368", | ||
97 | "mimetype": "application/zip", | 97 | "mimetype": "application/zip", | ||
98 | "mimetype_inner": null, | 98 | "mimetype_inner": null, | ||
99 | "name": "SDM-Genomic-Dataset 10k and 100k rows Testbeds", | 99 | "name": "SDM-Genomic-Dataset 10k and 100k rows Testbeds", | ||
100 | "package_id": "5f2180c2-be1d-45c3-a3c3-831f400b2f81", | 100 | "package_id": "5f2180c2-be1d-45c3-a3c3-831f400b2f81", | ||
101 | "position": 0, | 101 | "position": 0, | ||
102 | "resource_type": null, | 102 | "resource_type": null, | ||
103 | "size": 160589043, | 103 | "size": 160589043, | ||
104 | "state": "active", | 104 | "state": "active", | ||
105 | "url": | 105 | "url": | ||
106 | 65a3ddc-ff78-4a5e-8863-cbecdc76d803/download/sdm-genomic-dataset.zip", | 106 | 65a3ddc-ff78-4a5e-8863-cbecdc76d803/download/sdm-genomic-dataset.zip", | ||
107 | "url_type": "upload" | 107 | "url_type": "upload" | ||
108 | }, | 108 | }, | ||
109 | { | 109 | { | ||
110 | "auto_update": "No", | 110 | "auto_update": "No", | ||
111 | "auto_update_last_update": "", | 111 | "auto_update_last_update": "", | ||
112 | "auto_update_url": "", | 112 | "auto_update_url": "", | ||
113 | "cache_last_updated": null, | 113 | "cache_last_updated": null, | ||
114 | "cache_url": null, | 114 | "cache_url": null, | ||
115 | "created": "2023-10-17T10:49:37.520638", | 115 | "created": "2023-10-17T10:49:37.520638", | ||
116 | "description": "", | 116 | "description": "", | ||
117 | "format": "CSV", | 117 | "format": "CSV", | ||
118 | "hash": "", | 118 | "hash": "", | ||
119 | "id": "e6aeaf20-e3ab-446c-af56-8a1d17578a75", | 119 | "id": "e6aeaf20-e3ab-446c-af56-8a1d17578a75", | ||
120 | "last_modified": "2023-10-17T10:49:37.499916", | 120 | "last_modified": "2023-10-17T10:49:37.499916", | ||
121 | "metadata_modified": "2023-10-17T10:49:37.512453", | 121 | "metadata_modified": "2023-10-17T10:49:37.512453", | ||
122 | "mimetype": "text/csv", | 122 | "mimetype": "text/csv", | ||
123 | "mimetype_inner": null, | 123 | "mimetype_inner": null, | ||
124 | "name": "10k rows source with 25% duplicate rate", | 124 | "name": "10k rows source with 25% duplicate rate", | ||
125 | "package_id": "5f2180c2-be1d-45c3-a3c3-831f400b2f81", | 125 | "package_id": "5f2180c2-be1d-45c3-a3c3-831f400b2f81", | ||
126 | "position": 1, | 126 | "position": 1, | ||
127 | "resource_type": null, | 127 | "resource_type": null, | ||
128 | "size": 1788734, | 128 | "size": 1788734, | ||
129 | "state": "active", | 129 | "state": "active", | ||
130 | "url": | 130 | "url": | ||
131 | records_with_duplicate_and_each_duplicate_being_repeated_20times.csv", | 131 | records_with_duplicate_and_each_duplicate_being_repeated_20times.csv", | ||
132 | "url_type": "upload" | 132 | "url_type": "upload" | ||
t | t | 133 | }, | ||
134 | { | ||||
135 | "auto_update": "No", | ||||
136 | "auto_update_last_update": "", | ||||
137 | "auto_update_url": "", | ||||
138 | "cache_last_updated": null, | ||||
139 | "cache_url": null, | ||||
140 | "created": "2023-10-17T10:50:05.433365", | ||||
141 | "description": "", | ||||
142 | "format": "CSV", | ||||
143 | "hash": "", | ||||
144 | "id": "845052d6-27f5-47b7-a012-2df643bb5bc7", | ||||
145 | "last_modified": "2023-10-17T10:50:05.414532", | ||||
146 | "metadata_modified": "2023-10-17T10:50:05.426325", | ||||
147 | "mimetype": "text/csv", | ||||
148 | "mimetype_inner": null, | ||||
149 | "name": "10k rows source with 75% duplicate rate", | ||||
150 | "package_id": "5f2180c2-be1d-45c3-a3c3-831f400b2f81", | ||||
151 | "position": 2, | ||||
152 | "resource_type": null, | ||||
153 | "size": 1801842, | ||||
154 | "state": "active", | ||||
155 | "url": | ||||
156 | records_with_duplicate_and_each_duplicate_being_repeated_20times.csv", | ||||
157 | "url_type": "upload" | ||||
133 | } | 158 | } | ||
134 | ], | 159 | ], | ||
135 | "services_used_list": "", | 160 | "services_used_list": "", | ||
136 | "state": "active", | 161 | "state": "active", | ||
137 | "tags": [ | 162 | "tags": [ | ||
138 | { | 163 | { | ||
139 | "display_name": "Benchmark", | 164 | "display_name": "Benchmark", | ||
140 | "id": "70474eb4-f8bf-42f1-bf26-7511d4f3356c", | 165 | "id": "70474eb4-f8bf-42f1-bf26-7511d4f3356c", | ||
141 | "name": "Benchmark", | 166 | "name": "Benchmark", | ||
142 | "state": "active", | 167 | "state": "active", | ||
143 | "vocabulary_id": null | 168 | "vocabulary_id": null | ||
144 | }, | 169 | }, | ||
145 | { | 170 | { | ||
146 | "display_name": "health", | 171 | "display_name": "health", | ||
147 | "id": "4ad8e4c3-0431-4ed0-8d43-25db6f543a69", | 172 | "id": "4ad8e4c3-0431-4ed0-8d43-25db6f543a69", | ||
148 | "name": "health", | 173 | "name": "health", | ||
149 | "state": "active", | 174 | "state": "active", | ||
150 | "vocabulary_id": null | 175 | "vocabulary_id": null | ||
151 | } | 176 | } | ||
152 | ], | 177 | ], | ||
153 | "title": "SDM-Genomic-Dataset", | 178 | "title": "SDM-Genomic-Dataset", | ||
154 | "type": "dataset", | 179 | "type": "dataset", | ||
155 | "url": | 180 | "url": | ||
156 | "https://figshare.com/articles/dataset/SDM-Genomic-Datasets/14838342", | 181 | "https://figshare.com/articles/dataset/SDM-Genomic-Datasets/14838342", | ||
157 | "version": "" | 182 | "version": "" | ||
158 | } | 183 | } |