Changes
On August 4, 2023 at 9:04:29 AM UTC, admin:
-
No fields were updated. See the metadata diff for more details.
f | 1 | { | f | 1 | { |
2 | "author": "Bach, Jakob", | 2 | "author": "Bach, Jakob", | ||
3 | "author_email": "", | 3 | "author_email": "", | ||
4 | "creator_user_id": "17755db4-395a-4b3b-ac09-e8e3484ca700", | 4 | "creator_user_id": "17755db4-395a-4b3b-ac09-e8e3484ca700", | ||
5 | "doi": "10.35097/1623", | 5 | "doi": "10.35097/1623", | ||
6 | "doi_date_published": "2023", | 6 | "doi_date_published": "2023", | ||
7 | "doi_publisher": "", | 7 | "doi_publisher": "", | ||
8 | "doi_status": "True", | 8 | "doi_status": "True", | ||
9 | "groups": [], | 9 | "groups": [], | ||
10 | "id": "ef3d25f6-8b6f-4976-a9ea-53db6fdbc36a", | 10 | "id": "ef3d25f6-8b6f-4976-a9ea-53db6fdbc36a", | ||
11 | "isopen": false, | 11 | "isopen": false, | ||
12 | "license_id": "CC BY 4.0 Attribution", | 12 | "license_id": "CC BY 4.0 Attribution", | ||
13 | "license_title": "CC BY 4.0 Attribution", | 13 | "license_title": "CC BY 4.0 Attribution", | ||
14 | "metadata_created": "2023-08-04T08:51:10.655015", | 14 | "metadata_created": "2023-08-04T08:51:10.655015", | ||
t | 15 | "metadata_modified": "2023-08-04T08:53:48.180734", | t | 15 | "metadata_modified": "2023-08-04T09:04:29.771924", |
16 | "name": "rdr-doi-10-35097-1623", | 16 | "name": "rdr-doi-10-35097-1623", | ||
17 | "notes": "Abstract: These are the experimental data for the | 17 | "notes": "Abstract: These are the experimental data for the | ||
18 | paper\r\n\r\n> Bach, Jakob. \"Finding Optimal Diverse Feature Sets | 18 | paper\r\n\r\n> Bach, Jakob. \"Finding Optimal Diverse Feature Sets | ||
19 | with Alternative Feature Selection\"\r\n\r\npublished on | 19 | with Alternative Feature Selection\"\r\n\r\npublished on | ||
20 | [arXiv](https://arxiv.org/) in 2023.\r\nYou can find the paper | 20 | [arXiv](https://arxiv.org/) in 2023.\r\nYou can find the paper | ||
21 | [here](https://doi.org/10.48550/arXiv.2307.11607) and the code | 21 | [here](https://doi.org/10.48550/arXiv.2307.11607) and the code | ||
22 | [here](https://github.com/jakob-bach/alternative-feature-selection).\r\nSee | 22 | [here](https://github.com/jakob-bach/alternative-feature-selection).\r\nSee | ||
23 | the `README` for details.\r\n\r\nThe datasets used in our study (which | 23 | the `README` for details.\r\n\r\nThe datasets used in our study (which | ||
24 | we also provide here) originate from | 24 | we also provide here) originate from | ||
25 | [PMLB](https://epistasislab.github.io/pmlb/).\r\nThe corresponding | 25 | [PMLB](https://epistasislab.github.io/pmlb/).\r\nThe corresponding | ||
26 | [GitHub repository](https://github.com/EpistasisLab/pmlb) is | 26 | [GitHub repository](https://github.com/EpistasisLab/pmlb) is | ||
27 | MIT-licensed ((c) 2016 Epistasis Lab at UPenn).\r\nPlease see the file | 27 | MIT-licensed ((c) 2016 Epistasis Lab at UPenn).\r\nPlease see the file | ||
28 | `LICENSE` in the folder `datasets/` for the license | 28 | `LICENSE` in the folder `datasets/` for the license | ||
29 | text.\r\nTechnicalRemarks: # Experimental Data for the Paper \"Finding | 29 | text.\r\nTechnicalRemarks: # Experimental Data for the Paper \"Finding | ||
30 | Optimal Diverse Feature Sets with Alternative Feature | 30 | Optimal Diverse Feature Sets with Alternative Feature | ||
31 | Selection\"\r\n\r\nThese are the experimental data for the | 31 | Selection\"\r\n\r\nThese are the experimental data for the | ||
32 | paper\r\n\r\n> Bach, Jakob. \"Finding Optimal Diverse Feature Sets | 32 | paper\r\n\r\n> Bach, Jakob. \"Finding Optimal Diverse Feature Sets | ||
33 | with Alternative Feature Selection\"\r\n\r\npublished at | 33 | with Alternative Feature Selection\"\r\n\r\npublished at | ||
34 | [arXiv](https://arxiv.org/) in 2023.\r\nIf we create multiple versions | 34 | [arXiv](https://arxiv.org/) in 2023.\r\nIf we create multiple versions | ||
35 | of this paper in the future, these experimental data will cover at | 35 | of this paper in the future, these experimental data will cover at | ||
36 | least the first version.\r\n\r\nCheck our [GitHub | 36 | least the first version.\r\n\r\nCheck our [GitHub | ||
37 | repository](https://github.com/Jakob-Bach/Alternative-Feature-Selection) | 37 | repository](https://github.com/Jakob-Bach/Alternative-Feature-Selection) | ||
38 | for the code and instructions to reproduce the experiments.\r\nThe | 38 | for the code and instructions to reproduce the experiments.\r\nThe | ||
39 | data were obtained on a server with an `AMD EPYC 7551` CPU (32 | 39 | data were obtained on a server with an `AMD EPYC 7551` CPU (32 | ||
40 | physical cores, base clock of 2.0 GHz) and 128 GB RAM.\r\nThe | 40 | physical cores, base clock of 2.0 GHz) and 128 GB RAM.\r\nThe | ||
41 | operating system was `Ubuntu 20.04.6 LTS`.\r\nThe Python version was | 41 | operating system was `Ubuntu 20.04.6 LTS`.\r\nThe Python version was | ||
42 | `3.8`.\r\nWith this configuration, running the experimental pipeline | 42 | `3.8`.\r\nWith this configuration, running the experimental pipeline | ||
43 | (`run_experiments.py`) took about 255 h.\r\n\r\nThe commit hash for | 43 | (`run_experiments.py`) took about 255 h.\r\n\r\nThe commit hash for | ||
44 | the last run of the experimental pipeline (`run_experiments.py`) is | 44 | the last run of the experimental pipeline (`run_experiments.py`) is | ||
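45 | [`2212360`](https://github.com/Jakob-Bach/Alternative-Feature-Selection/tree/2212360a320121862556bc8d54da45cb6dc8ee93).\r\nThe | 45 | [`2212360`](https://github.com/Jakob-Bach/Alternative-Feature-Selection/tree/2212360a320121862556bc8d54da45cb6dc8ee93).\r\nThe | ||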
46 | commit hash for the last run of the evaluation pipeline | 46 | commit hash for the last run of the evaluation pipeline | ||
47 | (`run_evaluation.py`) is | 47 | (`run_evaluation.py`) is | ||
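48 | [`487b8ba`](https://github.com/Jakob-Bach/Alternative-Feature-Selection/tree/487b8ba04d513881be48fb2bc7b1c1dca416d122).\r\nWe | 48 | [`487b8ba`](https://github.com/Jakob-Bach/Alternative-Feature-Selection/tree/487b8ba04d513881be48fb2bc7b1c1dca416d122).\r\nWe | ||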
49 | also tagged both commits (`run-2023-06-23` and | 49 | also tagged both commits (`run-2023-06-23` and | ||
50 | `evaluation-2023-07-04`).\r\n\r\nThe experimental data are stored in | 50 | `evaluation-2023-07-04`).\r\n\r\nThe experimental data are stored in | ||
51 | two folders, `datasets/` and `results/`.\r\nFurther, the console | 51 | two folders, `datasets/` and `results/`.\r\nFurther, the console | ||
52 | output of `run_evaluation.py` is stored in | 52 | output of `run_evaluation.py` is stored in | ||
53 | `Evaluation_console_output.txt` (manually copied from the console to a | 53 | `Evaluation_console_output.txt` (manually copied from the console to a | ||
54 | file).\r\nIn the following, we describe the structure and content of | 54 | file).\r\nIn the following, we describe the structure and content of | ||
55 | each data file.\r\n\r\n## `datasets/`\r\n\r\nThese are the input data | 55 | each data file.\r\n\r\n## `datasets/`\r\n\r\nThese are the input data | ||
56 | for the experimental pipeline `run_experiments.py`, i.e., prediction | 56 | for the experimental pipeline `run_experiments.py`, i.e., prediction | ||
57 | datasets.\r\nThe folder contains one overview file, one license file, | 57 | datasets.\r\nThe folder contains one overview file, one license file, | ||
58 | and two files for each of the 30 datasets.\r\n\r\nThe original | 58 | and two files for each of the 30 datasets.\r\n\r\nThe original | ||
59 | datasets were downloaded from | 59 | datasets were downloaded from | ||
60 | [PMLB](https://epistasislab.github.io/pmlb/) with the script | 60 | [PMLB](https://epistasislab.github.io/pmlb/) with the script | ||
61 | `prepare_datasets.py`.\r\nNote that we do not own the copyright for | 61 | `prepare_datasets.py`.\r\nNote that we do not own the copyright for | ||
62 | these datasets.\r\nHowever, the [GitHub repository of | 62 | these datasets.\r\nHowever, the [GitHub repository of | ||
63 | PMLB](https://github.com/EpistasisLab/pmlb), which stores the original | 63 | PMLB](https://github.com/EpistasisLab/pmlb), which stores the original | ||
64 | datasets, is MIT-licensed ((c) 2016 Epistasis Lab at UPenn).\r\nThus, | 64 | datasets, is MIT-licensed ((c) 2016 Epistasis Lab at UPenn).\r\nThus, | ||
65 | we include the file `LICENSE` from that repository.\r\n\r\nAfter | 65 | we include the file `LICENSE` from that repository.\r\n\r\nAfter | ||
66 | downloading from `PMLB`, we split each dataset into the feature part | 66 | downloading from `PMLB`, we split each dataset into the feature part | ||
67 | (`_X.csv`) and the target part (`_y.csv`), which we save | 67 | (`_X.csv`) and the target part (`_y.csv`), which we save | ||
68 | separately.\r\nBoth files are CSVs only containing numeric values | 68 | separately.\r\nBoth files are CSVs only containing numeric values | ||
69 | (categorical features are ordinally encoded in `PMLB`) except for the | 69 | (categorical features are ordinally encoded in `PMLB`) except for the | ||
70 | column names.\r\nThere are no missing values.\r\nEach row corresponds | 70 | column names.\r\nThere are no missing values.\r\nEach row corresponds | ||
71 | to a data object (= instance, sample), and each column corresponds to | 71 | to a data object (= instance, sample), and each column corresponds to | ||
72 | a feature.\r\nThe first line in each CSV contains the names of the | 72 | a feature.\r\nThe first line in each CSV contains the names of the | ||
73 | features as strings; for `_y.csv` files, there is only one column, | 73 | features as strings; for `_y.csv` files, there is only one column, | ||
74 | always named `target`.\r\n\r\n`_dataset_overview.csv` contains | 74 | always named `target`.\r\n\r\n`_dataset_overview.csv` contains | ||
75 | meta-data for the datasets, like the number of instances and | 75 | meta-data for the datasets, like the number of instances and | ||
76 | features.\r\n\r\n## `results/`\r\n\r\nThese are the output data of the | 76 | features.\r\n\r\n## `results/`\r\n\r\nThese are the output data of the | ||
77 | experimental pipeline in the form of CSVs, produced by the script | 77 | experimental pipeline in the form of CSVs, produced by the script | ||
78 | `run_experiments.py`.\r\n`_results.csv` contains all results merged | 78 | `run_experiments.py`.\r\n`_results.csv` contains all results merged | ||
79 | into one file and acts as input for the script | 79 | into one file and acts as input for the script | ||
80 | `run_evaluation.py`.\r\nThe remaining files are subsets of the | 80 | `run_evaluation.py`.\r\nThe remaining files are subsets of the | ||
81 | results, as the experimental pipeline parallelizes over 30 datasets, 5 | 81 | results, as the experimental pipeline parallelizes over 30 datasets, 5 | ||
82 | cross-validation folds, and 5 feature-selection methods.\r\nThus, | 82 | cross-validation folds, and 5 feature-selection methods.\r\nThus, | ||
83 | there are `30 * 5 * 5 = 750` files containing subsets of the | 83 | there are `30 * 5 * 5 = 750` files containing subsets of the | ||
84 | results.\r\n\r\nEach row in a result file corresponds to one feature | 84 | results.\r\n\r\nEach row in a result file corresponds to one feature | ||
85 | set.\r\nOne can identify individual search runs for alternatives with | 85 | set.\r\nOne can identify individual search runs for alternatives with | ||
86 | a combination of multiple columns, i.e.:\r\n\r\n- dataset | 86 | a combination of multiple columns, i.e.:\r\n\r\n- dataset | ||
87 | `dataset_name`\r\n- cross-validation fold `split_idx`\r\n- | 87 | `dataset_name`\r\n- cross-validation fold `split_idx`\r\n- | ||
88 | feature-selection method `fs_name`\r\n- search method | 88 | feature-selection method `fs_name`\r\n- search method | ||
89 | `search_name`\r\n- objective aggregation `objective_agg`\r\n- | 89 | `search_name`\r\n- objective aggregation `objective_agg`\r\n- | ||
90 | feature-set size `k`\r\n- number of alternatives | 90 | feature-set size `k`\r\n- number of alternatives | ||
91 | `num_alternatives`\r\n- dissimilarity threshold `tau_abs`\r\n\r\nThe | 91 | `num_alternatives`\r\n- dissimilarity threshold `tau_abs`\r\n\r\nThe | ||
92 | remaining columns mostly represent evaluation metrics.\r\nIn detail, | 92 | remaining columns mostly represent evaluation metrics.\r\nIn detail, | ||
93 | all result files contain the following columns:\r\n\r\n- | 93 | all result files contain the following columns:\r\n\r\n- | ||
94 | `selected_idxs` (list of ints, e.g., `[0, 4, 5, 6, 8]`): The indices | 94 | `selected_idxs` (list of ints, e.g., `[0, 4, 5, 6, 8]`): The indices | ||
95 | (starting from 0) of the selected features (i.e., columns in the | 95 | (starting from 0) of the selected features (i.e., columns in the | ||
96 | dataset).\r\n Might also be an empty list, i.e., `[]` if no valid | 96 | dataset).\r\n Might also be an empty list, i.e., `[]` if no valid | ||
97 | solution was found.\r\n In that case, the two `_objective` columns | 97 | solution was found.\r\n In that case, the two `_objective` columns | ||
98 | and the four `_mcc` columns contain a missing value (empty | 98 | and the four `_mcc` columns contain a missing value (empty | ||
99 | string).\r\n- `train_objective` (float in `[-1,1]`): The training-set | 99 | string).\r\n- `train_objective` (float in `[-1,1]`): The training-set | ||
100 | objective value of the feature set.\r\n Three methods (FCBF, MI, | 100 | objective value of the feature set.\r\n Three methods (FCBF, MI, | ||
101 | Model Importance) have the range `[0,1]`, while two methods (mRMR, | 101 | Model Importance) have the range `[0,1]`, while two methods (mRMR, | ||
102 | Greedy Wrapper) have the range `[-1,1]`.\r\n- `test_objective` (float | 102 | Greedy Wrapper) have the range `[-1,1]`.\r\n- `test_objective` (float | ||
103 | in `[-1,1]`): The test-set objective value of the feature set.\r\n- | 103 | in `[-1,1]`): The test-set objective value of the feature set.\r\n- | ||
104 | `optimization_time` (non-negative float): Time for alternative feature | 104 | `optimization_time` (non-negative float): Time for alternative feature | ||
105 | selection in seconds.\r\n For white-box feature-selection methods, | 105 | selection in seconds.\r\n For white-box feature-selection methods, | ||
106 | this measures one solver call.\r\n In contrast, wrapper feature | 106 | this measures one solver call.\r\n In contrast, wrapper feature | ||
107 | selection can call the solver multiple times, and we record the total | 107 | selection can call the solver multiple times, and we record the total | ||
108 | runtime of the \"Greedy Wrapper\" algorithm.\r\n- | 108 | runtime of the \"Greedy Wrapper\" algorithm.\r\n- | ||
109 | `optimization_status` (int in `{0, 1, 2, 6}`): The status of the | 109 | `optimization_status` (int in `{0, 1, 2, 6}`): The status of the | ||
110 | solver; for wrapper feature selection, this is only the status of the | 110 | solver; for wrapper feature selection, this is only the status of the | ||
111 | last solver call.\r\n - 0 = (proven as) optimal\r\n - 1 = feasible | 111 | last solver call.\r\n - 0 = (proven as) optimal\r\n - 1 = feasible | ||
112 | (valid solution, but might be suboptimal)\r\n - 2 = (proven as) | 112 | (valid solution, but might be suboptimal)\r\n - 2 = (proven as) | ||
113 | infeasible\r\n - 6 = not solved (no valid solution found, but one | 113 | infeasible\r\n - 6 = not solved (no valid solution found, but one | ||
114 | might exist)\r\n- `decision_tree_train_mcc` (float in `[-1,1]`): | 114 | might exist)\r\n- `decision_tree_train_mcc` (float in `[-1,1]`): | ||
115 | Training-set prediction performance (in terms of Matthews Correlation | 115 | Training-set prediction performance (in terms of Matthews Correlation | ||
116 | Coefficient) of a decision tree trained with the selected | 116 | Coefficient) of a decision tree trained with the selected | ||
117 | features.\r\n- `decision_tree_test_mcc` (float in `[-1,1]`): Test-set | 117 | features.\r\n- `decision_tree_test_mcc` (float in `[-1,1]`): Test-set | ||
118 | prediction performance (in terms of Matthews Correlation Coefficient) | 118 | prediction performance (in terms of Matthews Correlation Coefficient) | ||
119 | of a decision tree trained with the selected features.\r\n- | 119 | of a decision tree trained with the selected features.\r\n- | ||
120 | `random_forest_train_mcc` (float in `[-1,1]`): Training-set prediction | 120 | `random_forest_train_mcc` (float in `[-1,1]`): Training-set prediction | ||
121 | performance (in terms of Matthews Correlation Coefficient) of a random | 121 | performance (in terms of Matthews Correlation Coefficient) of a random | ||
122 | forest trained with the selected features.\r\n- | 122 | forest trained with the selected features.\r\n- | ||
123 | `random_forest_test_mcc` (float in `[-1,1]`): Test-set prediction | 123 | `random_forest_test_mcc` (float in `[-1,1]`): Test-set prediction | ||
124 | performance (in terms of Matthews Correlation Coefficient) of a random | 124 | performance (in terms of Matthews Correlation Coefficient) of a random | ||
125 | forest trained with the selected features.\r\n- `k` (int in `{5, | 125 | forest trained with the selected features.\r\n- `k` (int in `{5, | ||
126 | 10}`): The number of features to be selected.\r\n- `tau_abs` (int in | 126 | 10}`): The number of features to be selected.\r\n- `tau_abs` (int in | ||
127 | `{1, ..., 10}`): The dissimilarity threshold for alternatives, | 127 | `{1, ..., 10}`): The dissimilarity threshold for alternatives, | ||
128 | corresponding to the absolute number of features (`k * tau`) that have | 128 | corresponding to the absolute number of features (`k * tau`) that have | ||
129 | to differ between feature sets.\r\n- `num_alternatives` (int in `{1, | 129 | to differ between feature sets.\r\n- `num_alternatives` (int in `{1, | ||
130 | 2, 3, 4, 5, 10}`): The number of desired alternative feature sets, not | 130 | 2, 3, 4, 5, 10}`): The number of desired alternative feature sets, not | ||
131 | counting the original (zeroth) feature set.\r\n A number from `{1, 2, | 131 | counting the original (zeroth) feature set.\r\n A number from `{1, 2, | ||
132 | 3, 4, 5}` for simultaneous search and always `10` for sequential | 132 | 3, 4, 5}` for simultaneous search and always `10` for sequential | ||
133 | search.\r\n- `objective_agg` (string, 2 different values): The name of | 133 | search.\r\n- `objective_agg` (string, 2 different values): The name of | ||
134 | the quality-aggregation function for alternatives (`min` or | 134 | the quality-aggregation function for alternatives (`min` or | ||
135 | `sum`).\r\n Min-aggregation or sum-aggregation for simultaneous | 135 | `sum`).\r\n Min-aggregation or sum-aggregation for simultaneous | ||
136 | search but always sum-aggregation for sequential search.\r\n (In | 136 | search but always sum-aggregation for sequential search.\r\n (In | ||
137 | fact, sequential search only optimizes individual feature sets anyway, | 137 | fact, sequential search only optimizes individual feature sets anyway, | ||
138 | so the aggregation does not matter for the search.)\r\n- `search_name` | 138 | so the aggregation does not matter for the search.)\r\n- `search_name` | ||
139 | (string, 2 different values): The name of the search method for | 139 | (string, 2 different values): The name of the search method for | ||
140 | alternatives (`search_sequentially` or `search_simultaneously`).\r\n- | 140 | alternatives (`search_sequentially` or `search_simultaneously`).\r\n- | ||
141 | `fs_name` (string, 5 different values): The name of the | 141 | `fs_name` (string, 5 different values): The name of the | ||
142 | feature-selection method (`FCBFSelector`, `MISelector`, | 142 | feature-selection method (`FCBFSelector`, `MISelector`, | ||
143 | `ModelImportanceSelector` (= Model Gain), `MRMRSelector`, or | 143 | `ModelImportanceSelector` (= Model Gain), `MRMRSelector`, or | ||
144 | `GreedyWrapperSelector`).\r\n- `dataset_name` (string, 30 different | 144 | `GreedyWrapperSelector`).\r\n- `dataset_name` (string, 30 different | ||
145 | values): The name of the `PMLB` dataset.\r\n- `n` (positive int): The | 145 | values): The name of the `PMLB` dataset.\r\n- `n` (positive int): The | ||
146 | number of features of the `PMLB` dataset.\r\n- `split_idx` (int in | 146 | number of features of the `PMLB` dataset.\r\n- `split_idx` (int in | ||
147 | `[0,4]`): The index of the cross-validation fold.\r\n- `wrapper_iters` | 147 | `[0,4]`): The index of the cross-validation fold.\r\n- `wrapper_iters` | ||
148 | (int in `[1,1000]`): The number of iterations in case wrapper feature | 148 | (int in `[1,1000]`): The number of iterations in case wrapper feature | ||
149 | selection was used, missing value (empty string) in the other | 149 | selection was used, missing value (empty string) in the other | ||
150 | cases.\r\n This column does not exist in result files not containing | 150 | cases.\r\n This column does not exist in result files not containing | ||
151 | wrapper results.\r\n\r\nYou can easily read in any of the result files | 151 | wrapper results.\r\n\r\nYou can easily read in any of the result files | ||
152 | with `pandas`:\r\n\r\n```python\r\nimport pandas as pd\r\n\r\nresults | 152 | with `pandas`:\r\n\r\n```python\r\nimport pandas as pd\r\n\r\nresults | ||
153 | = pd.read_csv('results/_results.csv')\r\n```\r\n\r\nAll result files | 153 | = pd.read_csv('results/_results.csv')\r\n```\r\n\r\nAll result files | ||
154 | are comma-separated and contain plain numbers and unquoted strings, | 154 | are comma-separated and contain plain numbers and unquoted strings, | ||
155 | apart from the column `selected_idxs` (which is quoted and | 155 | apart from the column `selected_idxs` (which is quoted and | ||
156 | represents lists of integers).\r\nThe first line in each result file | 156 | represents lists of integers).\r\nThe first line in each result file | ||
157 | contains the column names.\r\nYou can use the following code to make | 157 | contains the column names.\r\nYou can use the following code to make | ||
158 | sure that lists of feature indices are treated as such (rather than | 158 | sure that lists of feature indices are treated as such (rather than | ||
159 | strings):\r\n\r\n```python\r\nimport | 159 | strings):\r\n\r\n```python\r\nimport | ||
160 | ast\r\n\r\nresults['selected_idxs'] = | 160 | ast\r\n\r\nresults['selected_idxs'] = | ||
161 | results['selected_idxs'].apply(ast.literal_eval)\r\n```", | 161 | results['selected_idxs'].apply(ast.literal_eval)\r\n```", | ||
162 | "num_resources": 0, | 162 | "num_resources": 0, | ||
163 | "num_tags": 7, | 163 | "num_tags": 7, | ||
164 | "orcid": "# Experimental Data for the Paper \"Finding Optimal | 164 | "orcid": "# Experimental Data for the Paper \"Finding Optimal | ||
165 | Diverse Feature Sets with Alternative Feature Selection\"\r\n\r\nThese | 165 | Diverse Feature Sets with Alternative Feature Selection\"\r\n\r\nThese | ||
166 | are the experimental data for the paper\r\n\r\n> Bach, Jakob. | 166 | are the experimental data for the paper\r\n\r\n> Bach, Jakob. | ||
167 | \"Finding Optimal Diverse Feature Sets with Alternative Feature | 167 | \"Finding Optimal Diverse Feature Sets with Alternative Feature | ||
168 | Selection\"\r\n\r\npublished at [arXiv](https://arxiv.org/) in | 168 | Selection\"\r\n\r\npublished at [arXiv](https://arxiv.org/) in | ||
169 | 2023.\r\nIf we create multiple versions of this paper in the future, | 169 | 2023.\r\nIf we create multiple versions of this paper in the future, | ||
170 | these experimental data will cover at least the first | 170 | these experimental data will cover at least the first | ||
171 | version.\r\n\r\nCheck our [GitHub | 171 | version.\r\n\r\nCheck our [GitHub | ||
172 | repository](https://github.com/Jakob-Bach/Alternative-Feature-Selection) | 172 | repository](https://github.com/Jakob-Bach/Alternative-Feature-Selection) | ||
173 | for the code and instructions to reproduce the experiments.\r\nThe | 173 | for the code and instructions to reproduce the experiments.\r\nThe | ||
174 | data were obtained on a server with an `AMD EPYC 7551` CPU (32 | 174 | data were obtained on a server with an `AMD EPYC 7551` CPU (32 | ||
175 | physical cores, base clock of 2.0 GHz) and 128 GB RAM.\r\nThe | 175 | physical cores, base clock of 2.0 GHz) and 128 GB RAM.\r\nThe | ||
176 | operating system was `Ubuntu 20.04.6 LTS`.\r\nThe Python version was | 176 | operating system was `Ubuntu 20.04.6 LTS`.\r\nThe Python version was | ||
177 | `3.8`.\r\nWith this configuration, running the experimental pipeline | 177 | `3.8`.\r\nWith this configuration, running the experimental pipeline | ||
178 | (`run_experiments.py`) took about 255 h.\r\n\r\nThe commit hash for | 178 | (`run_experiments.py`) took about 255 h.\r\n\r\nThe commit hash for | ||
179 | the last run of the experimental pipeline (`run_experiments.py`) is | 179 | the last run of the experimental pipeline (`run_experiments.py`) is | ||
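180 | [`2212360`](https://github.com/Jakob-Bach/Alternative-Feature-Selection/tree/2212360a320121862556bc8d54da45cb6dc8ee93).\r\nThe | 180 | [`2212360`](https://github.com/Jakob-Bach/Alternative-Feature-Selection/tree/2212360a320121862556bc8d54da45cb6dc8ee93).\r\nThe | ||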
181 | commit hash for the last run of the evaluation pipeline | 181 | commit hash for the last run of the evaluation pipeline | ||
182 | (`run_evaluation.py`) is | 182 | (`run_evaluation.py`) is | ||
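183 | [`487b8ba`](https://github.com/Jakob-Bach/Alternative-Feature-Selection/tree/487b8ba04d513881be48fb2bc7b1c1dca416d122).\r\nWe | 183 | [`487b8ba`](https://github.com/Jakob-Bach/Alternative-Feature-Selection/tree/487b8ba04d513881be48fb2bc7b1c1dca416d122).\r\nWe | ||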
184 | also tagged both commits (`run-2023-06-23` and | 184 | also tagged both commits (`run-2023-06-23` and | ||
185 | `evaluation-2023-07-04`).\r\n\r\nThe experimental data are stored in | 185 | `evaluation-2023-07-04`).\r\n\r\nThe experimental data are stored in | ||
186 | two folders, `datasets/` and `results/`.\r\nFurther, the console | 186 | two folders, `datasets/` and `results/`.\r\nFurther, the console | ||
187 | output of `run_evaluation.py` is stored in | 187 | output of `run_evaluation.py` is stored in | ||
188 | `Evaluation_console_output.txt` (manually copied from the console to a | 188 | `Evaluation_console_output.txt` (manually copied from the console to a | ||
189 | file).\r\nIn the following, we describe the structure and content of | 189 | file).\r\nIn the following, we describe the structure and content of | ||
190 | each data file.\r\n\r\n## `datasets/`\r\n\r\nThese are the input data | 190 | each data file.\r\n\r\n## `datasets/`\r\n\r\nThese are the input data | ||
191 | for the experimental pipeline `run_experiments.py`, i.e., prediction | 191 | for the experimental pipeline `run_experiments.py`, i.e., prediction | ||
192 | datasets.\r\nThe folder contains one overview file, one license file, | 192 | datasets.\r\nThe folder contains one overview file, one license file, | ||
193 | and two files for each of the 30 datasets.\r\n\r\nThe original | 193 | and two files for each of the 30 datasets.\r\n\r\nThe original | ||
194 | datasets were downloaded from | 194 | datasets were downloaded from | ||
195 | [PMLB](https://epistasislab.github.io/pmlb/) with the script | 195 | [PMLB](https://epistasislab.github.io/pmlb/) with the script | ||
196 | `prepare_datasets.py`.\r\nNote that we do not own the copyright for | 196 | `prepare_datasets.py`.\r\nNote that we do not own the copyright for | ||
197 | these datasets.\r\nHowever, the [GitHub repository of | 197 | these datasets.\r\nHowever, the [GitHub repository of | ||
198 | PMLB](https://github.com/EpistasisLab/pmlb), which stores the original | 198 | PMLB](https://github.com/EpistasisLab/pmlb), which stores the original | ||
199 | datasets, is MIT-licensed ((c) 2016 Epistasis Lab at UPenn).\r\nThus, | 199 | datasets, is MIT-licensed ((c) 2016 Epistasis Lab at UPenn).\r\nThus, | ||
200 | we include the file `LICENSE` from that repository.\r\n\r\nAfter | 200 | we include the file `LICENSE` from that repository.\r\n\r\nAfter | ||
201 | downloading from `PMLB`, we split each dataset into the feature part | 201 | downloading from `PMLB`, we split each dataset into the feature part | ||
202 | (`_X.csv`) and the target part (`_y.csv`), which we save | 202 | (`_X.csv`) and the target part (`_y.csv`), which we save | ||
203 | separately.\r\nBoth files are CSVs only containing numeric values | 203 | separately.\r\nBoth files are CSVs only containing numeric values | ||
204 | (categorical features are ordinally encoded in `PMLB`) except for the | 204 | (categorical features are ordinally encoded in `PMLB`) except for the | ||
205 | column names.\r\nThere are no missing values.\r\nEach row corresponds | 205 | column names.\r\nThere are no missing values.\r\nEach row corresponds | ||
206 | to a data object (= instance, sample), and each column corresponds to | 206 | to a data object (= instance, sample), and each column corresponds to | ||
207 | a feature.\r\nThe first line in each CSV contains the names of the | 207 | a feature.\r\nThe first line in each CSV contains the names of the | ||
208 | features as strings; for `_y.csv` files, there is only one column, | 208 | features as strings; for `_y.csv` files, there is only one column, | ||
209 | always named `target`.\r\n\r\n`_dataset_overview.csv` contains | 209 | always named `target`.\r\n\r\n`_dataset_overview.csv` contains | ||
210 | meta-data for the datasets, like the number of instances and | 210 | meta-data for the datasets, like the number of instances and | ||
211 | features.\r\n\r\n## `results/`\r\n\r\nThese are the output data of the | 211 | features.\r\n\r\n## `results/`\r\n\r\nThese are the output data of the | ||
212 | experimental pipeline in the form of CSVs, produced by the script | 212 | experimental pipeline in the form of CSVs, produced by the script | ||
213 | `run_experiments.py`.\r\n`_results.csv` contains all results merged | 213 | `run_experiments.py`.\r\n`_results.csv` contains all results merged | ||
214 | into one file and acts as input for the script | 214 | into one file and acts as input for the script | ||
215 | `run_evaluation.py`.\r\nThe remaining files are subsets of the | 215 | `run_evaluation.py`.\r\nThe remaining files are subsets of the | ||
216 | results, as the experimental pipeline parallelizes over 30 datasets, 5 | 216 | results, as the experimental pipeline parallelizes over 30 datasets, 5 | ||
217 | cross-validation folds, and 5 feature-selection methods.\r\nThus, | 217 | cross-validation folds, and 5 feature-selection methods.\r\nThus, | ||
218 | there are `30 * 5 * 5 = 750` files containing subsets of the | 218 | there are `30 * 5 * 5 = 750` files containing subsets of the | ||
219 | results.\r\n\r\nEach row in a result file corresponds to one feature | 219 | results.\r\n\r\nEach row in a result file corresponds to one feature | ||
220 | set.\r\nOne can identify individual search runs for alternatives with | 220 | set.\r\nOne can identify individual search runs for alternatives with | ||
221 | a combination of multiple columns, i.e.:\r\n\r\n- dataset | 221 | a combination of multiple columns, i.e.:\r\n\r\n- dataset | ||
222 | `dataset_name`\r\n- cross-validation fold `split_idx`\r\n- | 222 | `dataset_name`\r\n- cross-validation fold `split_idx`\r\n- | ||
223 | feature-selection method `fs_name`\r\n- search method | 223 | feature-selection method `fs_name`\r\n- search method | ||
224 | `search_name`\r\n- objective aggregation `objective_agg`\r\n- | 224 | `search_name`\r\n- objective aggregation `objective_agg`\r\n- | ||
225 | feature-set size `k`\r\n- number of alternatives | 225 | feature-set size `k`\r\n- number of alternatives | ||
226 | `num_alternatives`\r\n- dissimilarity threshold `tau_abs`\r\n\r\nThe | 226 | `num_alternatives`\r\n- dissimilarity threshold `tau_abs`\r\n\r\nThe | ||
227 | remaining columns mostly represent evaluation metrics.\r\nIn detail, | 227 | remaining columns mostly represent evaluation metrics.\r\nIn detail, | ||
228 | all result files contain the following columns:\r\n\r\n- | 228 | all result files contain the following columns:\r\n\r\n- | ||
229 | `selected_idxs` (list of ints, e.g., `[0, 4, 5, 6, 8]`): The indices | 229 | `selected_idxs` (list of ints, e.g., `[0, 4, 5, 6, 8]`): The indices | ||
230 | (starting from 0) of the selected features (i.e., columns in the | 230 | (starting from 0) of the selected features (i.e., columns in the | ||
231 | dataset).\r\n Might also be an empty list, i.e., `[]` if no valid | 231 | dataset).\r\n Might also be an empty list, i.e., `[]` if no valid | ||
232 | solution was found.\r\n In that case, the two `_objective` columns | 232 | solution was found.\r\n In that case, the two `_objective` columns | ||
233 | and the four `_mcc` columns contain a missing value (empty | 233 | and the four `_mcc` columns contain a missing value (empty | ||
234 | string).\r\n- `train_objective` (float in `[-1,1]`): The training-set | 234 | string).\r\n- `train_objective` (float in `[-1,1]`): The training-set | ||
235 | objective value of the feature set.\r\n Three methods (FCBF, MI, | 235 | objective value of the feature set.\r\n Three methods (FCBF, MI, | ||
236 | Model Importance) have the range `[0,1]`, while two methods (mRMR, | 236 | Model Importance) have the range `[0,1]`, while two methods (mRMR, | ||
237 | Greedy Wrapper) have the range `[-1,1]`.\r\n- `test_objective` (float | 237 | Greedy Wrapper) have the range `[-1,1]`.\r\n- `test_objective` (float | ||
238 | in `[-1,1]`): The test-set objective value of the feature set.\r\n- | 238 | in `[-1,1]`): The test-set objective value of the feature set.\r\n- | ||
239 | `optimization_time` (non-negative float): Time for alternative feature | 239 | `optimization_time` (non-negative float): Time for alternative feature | ||
240 | selection in seconds.\r\n For white-box feature-selection methods, | 240 | selection in seconds.\r\n For white-box feature-selection methods, | ||
241 | this measures one solver call.\r\n In contrast, wrapper feature | 241 | this measures one solver call.\r\n In contrast, wrapper feature | ||
242 | selection can call the solver multiple times, and we record the total | 242 | selection can call the solver multiple times, and we record the total | ||
243 | runtime of the \"Greedy Wrapper\" algorithm.\r\n- | 243 | runtime of the \"Greedy Wrapper\" algorithm.\r\n- | ||
244 | `optimization_status` (int in `{0, 1, 2, 6}`): The status of the | 244 | `optimization_status` (int in `{0, 1, 2, 6}`): The status of the | ||
245 | solver; for wrapper feature selection, this is only the status of the | 245 | solver; for wrapper feature selection, this is only the status of the | ||
246 | last solver call.\r\n - 0 = (proven as) optimal\r\n - 1 = feasible | 246 | last solver call.\r\n - 0 = (proven as) optimal\r\n - 1 = feasible | ||
247 | (valid solution, but might be suboptimal)\r\n - 2 = (proven as) | 247 | (valid solution, but might be suboptimal)\r\n - 2 = (proven as) | ||
248 | infeasible\r\n - 6 = not solved (no valid solution found, but one | 248 | infeasible\r\n - 6 = not solved (no valid solution found, but one | ||
249 | might exist)\r\n- `decision_tree_train_mcc` (float in `[-1,1]`): | 249 | might exist)\r\n- `decision_tree_train_mcc` (float in `[-1,1]`): | ||
250 | Training-set prediction performance (in terms of Matthews Correlation | 250 | Training-set prediction performance (in terms of Matthews Correlation | ||
251 | Coefficient) of a decision tree trained with the selected | 251 | Coefficient) of a decision tree trained with the selected | ||
252 | features.\r\n- `decision_tree_test_mcc` (float in `[-1,1]`): Test-set | 252 | features.\r\n- `decision_tree_test_mcc` (float in `[-1,1]`): Test-set | ||
253 | prediction performance (in terms of Matthews Correlation Coefficient) | 253 | prediction performance (in terms of Matthews Correlation Coefficient) | ||
254 | of a decision tree trained with the selected features.\r\n- | 254 | of a decision tree trained with the selected features.\r\n- | ||
255 | `random_forest_train_mcc` (float in `[-1,1]`): Training-set prediction | 255 | `random_forest_train_mcc` (float in `[-1,1]`): Training-set prediction | ||
256 | performance (in terms of Matthews Correlation Coefficient) of a random | 256 | performance (in terms of Matthews Correlation Coefficient) of a random | ||
257 | forest trained with the selected features.\r\n- | 257 | forest trained with the selected features.\r\n- | ||
258 | `random_forest_test_mcc` (float in `[-1,1]`): Test-set prediction | 258 | `random_forest_test_mcc` (float in `[-1,1]`): Test-set prediction | ||
259 | performance (in terms of Matthews Correlation Coefficient) of a random | 259 | performance (in terms of Matthews Correlation Coefficient) of a random | ||
260 | forest trained with the selected features.\r\n- `k` (int in `{5, | 260 | forest trained with the selected features.\r\n- `k` (int in `{5, | ||
261 | 10}`): The number of features to be selected.\r\n- `tau_abs` (int in | 261 | 10}`): The number of features to be selected.\r\n- `tau_abs` (int in | ||
262 | `{1, ..., 10}`): The dissimilarity threshold for alternatives, | 262 | `{1, ..., 10}`): The dissimilarity threshold for alternatives, | ||
263 | corresponding to the absolute number of features (`k * tau`) that have | 263 | corresponding to the absolute number of features (`k * tau`) that have | ||
264 | to differ between feature sets.\r\n- `num_alternatives` (int in `{1, | 264 | to differ between feature sets.\r\n- `num_alternatives` (int in `{1, | ||
265 | 2, 3, 4, 5, 10}`): The number of desired alternative feature sets, not | 265 | 2, 3, 4, 5, 10}`): The number of desired alternative feature sets, not | ||
266 | counting the original (zeroth) feature set.\r\n A number from `{1, 2, | 266 | counting the original (zeroth) feature set.\r\n A number from `{1, 2, | ||
267 | 3, 4, 5}` for simultaneous search and always `10` for sequential | 267 | 3, 4, 5}` for simultaneous search and always `10` for sequential | ||
268 | search.\r\n- `objective_agg` (string, 2 different values): The name of | 268 | search.\r\n- `objective_agg` (string, 2 different values): The name of | ||
269 | the quality-aggregation function for alternatives (`min` or | 269 | the quality-aggregation function for alternatives (`min` or | ||
270 | `sum`).\r\n Min-aggregation or sum-aggregation for simultaneous | 270 | `sum`).\r\n Min-aggregation or sum-aggregation for simultaneous | ||
271 | search but always sum-aggregation for sequential search.\r\n (In | 271 | search but always sum-aggregation for sequential search.\r\n (In | ||
272 | fact, sequential search only optimizes individual feature sets anyway, | 272 | fact, sequential search only optimizes individual feature sets anyway, | ||
273 | so the aggregation does not matter for the search.)\r\n- `search_name` | 273 | so the aggregation does not matter for the search.)\r\n- `search_name` | ||
274 | (string, 2 different values): The name of the search method for | 274 | (string, 2 different values): The name of the search method for | ||
275 | alternatives (`search_sequentially` or `search_simultaneously`).\r\n- | 275 | alternatives (`search_sequentially` or `search_simultaneously`).\r\n- | ||
276 | `fs_name` (string, 5 different values): The name of the | 276 | `fs_name` (string, 5 different values): The name of the | ||
277 | feature-selection method (`FCBFSelector`, `MISelector`, | 277 | feature-selection method (`FCBFSelector`, `MISelector`, | ||
278 | `ModelImportanceSelector` (= Model Gain), `MRMRSelector`, or | 278 | `ModelImportanceSelector` (= Model Gain), `MRMRSelector`, or | ||
279 | `GreedyWrapperSelector`).\r\n- `dataset_name` (string, 30 different | 279 | `GreedyWrapperSelector`).\r\n- `dataset_name` (string, 30 different | ||
280 | values): The name of the `PMLB` dataset.\r\n- `n` (positive int): The | 280 | values): The name of the `PMLB` dataset.\r\n- `n` (positive int): The | ||
281 | number of features of the `PMLB` dataset.\r\n- `split_idx` (int in | 281 | number of features of the `PMLB` dataset.\r\n- `split_idx` (int in | ||
282 | `[0,4]`): The index of the cross-validation fold.\r\n- `wrapper_iters` | 282 | `[0,4]`): The index of the cross-validation fold.\r\n- `wrapper_iters` | ||
283 | (int in `[1,1000]`): The number of iterations in case wrapper feature | 283 | (int in `[1,1000]`): The number of iterations in case wrapper feature | ||
284 | selection was used, missing value (empty string) in the other | 284 | selection was used, missing value (empty string) in the other | ||
285 | cases.\r\n This column does not exist in result files not containing | 285 | cases.\r\n This column does not exist in result files not containing | ||
286 | wrapper results.\r\n\r\nYou can easily read in any of the result files | 286 | wrapper results.\r\n\r\nYou can easily read in any of the result files | ||
287 | with `pandas`:\r\n\r\n```python\r\nimport pandas as pd\r\n\r\nresults | 287 | with `pandas`:\r\n\r\n```python\r\nimport pandas as pd\r\n\r\nresults | ||
288 | = pd.read_csv('results/_results.csv')\r\n```\r\n\r\nAll result files | 288 | = pd.read_csv('results/_results.csv')\r\n```\r\n\r\nAll result files | ||
289 | are comma-separated and contain plain numbers and unquoted strings, | 289 | are comma-separated and contain plain numbers and unquoted strings, | ||
290 | apart from the column `selected_idxs` (which is quoted and | 290 | apart from the column `selected_idxs` (which is quoted and | ||
291 | represents lists of integers).\r\nThe first line in each result file | 291 | represents lists of integers).\r\nThe first line in each result file | ||
292 | contains the column names.\r\nYou can use the following code to make | 292 | contains the column names.\r\nYou can use the following code to make | ||
293 | sure that lists of feature indices are treated as such (rather than | 293 | sure that lists of feature indices are treated as such (rather than | ||
294 | strings):\r\n\r\n```python\r\nimport | 294 | strings):\r\n\r\n```python\r\nimport | ||
295 | ast\r\n\r\nresults['selected_idxs'] = | 295 | ast\r\n\r\nresults['selected_idxs'] = | ||
296 | results['selected_idxs'].apply(ast.literal_eval)\r\n```", | 296 | results['selected_idxs'].apply(ast.literal_eval)\r\n```", | ||
297 | "organization": { | 297 | "organization": { | ||
298 | "approval_status": "approved", | 298 | "approval_status": "approved", | ||
299 | "created": "2023-01-12T13:30:23.238233", | 299 | "created": "2023-01-12T13:30:23.238233", | ||
300 | "description": "RADAR (Research Data Repository) is a | 300 | "description": "RADAR (Research Data Repository) is a | ||
301 | cross-disciplinary repository for archiving and publishing research | 301 | cross-disciplinary repository for archiving and publishing research | ||
302 | data from completed scientific studies and projects. The focus is on | 302 | data from completed scientific studies and projects. The focus is on | ||
303 | research data from subjects that do not yet have their own | 303 | research data from subjects that do not yet have their own | ||
304 | discipline-specific infrastructures for research data management. ", | 304 | discipline-specific infrastructures for research data management. ", | ||
305 | "id": "013c89a9-383c-4200-8baa-0f78bf1d91f9", | 305 | "id": "013c89a9-383c-4200-8baa-0f78bf1d91f9", | ||
306 | "image_url": "radar-logo.svg", | 306 | "image_url": "radar-logo.svg", | ||
307 | "is_organization": true, | 307 | "is_organization": true, | ||
308 | "name": "radar", | 308 | "name": "radar", | ||
309 | "state": "active", | 309 | "state": "active", | ||
310 | "title": "RADAR", | 310 | "title": "RADAR", | ||
311 | "type": "organization" | 311 | "type": "organization" | ||
312 | }, | 312 | }, | ||
313 | "owner_org": "013c89a9-383c-4200-8baa-0f78bf1d91f9", | 313 | "owner_org": "013c89a9-383c-4200-8baa-0f78bf1d91f9", | ||
314 | "private": false, | 314 | "private": false, | ||
315 | "production_year": "2023", | 315 | "production_year": "2023", | ||
316 | "publication_year": "2023", | 316 | "publication_year": "2023", | ||
317 | "publishers": [ | 317 | "publishers": [ | ||
318 | { | 318 | { | ||
319 | "publisher": "Karlsruhe Institute of Technology" | 319 | "publisher": "Karlsruhe Institute of Technology" | ||
320 | } | 320 | } | ||
321 | ], | 321 | ], | ||
322 | "relationships_as_object": [], | 322 | "relationships_as_object": [], | ||
323 | "relationships_as_subject": [], | 323 | "relationships_as_subject": [], | ||
324 | "repository_name": "RADAR (Research Data Repository)", | 324 | "repository_name": "RADAR (Research Data Repository)", | ||
325 | "resources": [], | 325 | "resources": [], | ||
326 | "services_used_list": "", | 326 | "services_used_list": "", | ||
327 | "source_metadata_created": "2023", | 327 | "source_metadata_created": "2023", | ||
328 | "source_metadata_modified": "", | 328 | "source_metadata_modified": "", | ||
329 | "state": "active", | 329 | "state": "active", | ||
330 | "subject_areas": [ | 330 | "subject_areas": [ | ||
331 | { | 331 | { | ||
332 | "subject_area_additional": "", | 332 | "subject_area_additional": "", | ||
333 | "subject_area_name": "Computer Science" | 333 | "subject_area_name": "Computer Science" | ||
334 | } | 334 | } | ||
335 | ], | 335 | ], | ||
336 | "tags": [ | 336 | "tags": [ | ||
337 | { | 337 | { | ||
338 | "display_name": "XAI", | 338 | "display_name": "XAI", | ||
339 | "id": "43355d15-2c6a-461c-a316-59db44d3772e", | 339 | "id": "43355d15-2c6a-461c-a316-59db44d3772e", | ||
340 | "name": "XAI", | 340 | "name": "XAI", | ||
341 | "state": "active", | 341 | "state": "active", | ||
342 | "vocabulary_id": null | 342 | "vocabulary_id": null | ||
343 | }, | 343 | }, | ||
344 | { | 344 | { | ||
345 | "display_name": "alternatives", | 345 | "display_name": "alternatives", | ||
346 | "id": "9a9c5c70-6866-43ee-ac05-c188fbe85b96", | 346 | "id": "9a9c5c70-6866-43ee-ac05-c188fbe85b96", | ||
347 | "name": "alternatives", | 347 | "name": "alternatives", | ||
348 | "state": "active", | 348 | "state": "active", | ||
349 | "vocabulary_id": null | 349 | "vocabulary_id": null | ||
350 | }, | 350 | }, | ||
351 | { | 351 | { | ||
352 | "display_name": "constraints", | 352 | "display_name": "constraints", | ||
353 | "id": "41e92e4d-cf4e-4a7e-ac40-b31e436617d7", | 353 | "id": "41e92e4d-cf4e-4a7e-ac40-b31e436617d7", | ||
354 | "name": "constraints", | 354 | "name": "constraints", | ||
355 | "state": "active", | 355 | "state": "active", | ||
356 | "vocabulary_id": null | 356 | "vocabulary_id": null | ||
357 | }, | 357 | }, | ||
358 | { | 358 | { | ||
359 | "display_name": "explainability", | 359 | "display_name": "explainability", | ||
360 | "id": "689f82db-f998-41ef-ae4b-88d9528b2431", | 360 | "id": "689f82db-f998-41ef-ae4b-88d9528b2431", | ||
361 | "name": "explainability", | 361 | "name": "explainability", | ||
362 | "state": "active", | 362 | "state": "active", | ||
363 | "vocabulary_id": null | 363 | "vocabulary_id": null | ||
364 | }, | 364 | }, | ||
365 | { | 365 | { | ||
366 | "display_name": "feature selection", | 366 | "display_name": "feature selection", | ||
367 | "id": "d52fade1-2240-4e3a-9d12-6301e9a2a69f", | 367 | "id": "d52fade1-2240-4e3a-9d12-6301e9a2a69f", | ||
368 | "name": "feature selection", | 368 | "name": "feature selection", | ||
369 | "state": "active", | 369 | "state": "active", | ||
370 | "vocabulary_id": null | 370 | "vocabulary_id": null | ||
371 | }, | 371 | }, | ||
372 | { | 372 | { | ||
373 | "display_name": "interpretability", | 373 | "display_name": "interpretability", | ||
374 | "id": "7003db81-bd53-431c-b2e0-75280f8b4ede", | 374 | "id": "7003db81-bd53-431c-b2e0-75280f8b4ede", | ||
375 | "name": "interpretability", | 375 | "name": "interpretability", | ||
376 | "state": "active", | 376 | "state": "active", | ||
377 | "vocabulary_id": null | 377 | "vocabulary_id": null | ||
378 | }, | 378 | }, | ||
379 | { | 379 | { | ||
380 | "display_name": "mixed-integer programming", | 380 | "display_name": "mixed-integer programming", | ||
381 | "id": "441fc717-84e4-4afa-a64b-de1c16ffcf03", | 381 | "id": "441fc717-84e4-4afa-a64b-de1c16ffcf03", | ||
382 | "name": "mixed-integer programming", | 382 | "name": "mixed-integer programming", | ||
383 | "state": "active", | 383 | "state": "active", | ||
384 | "vocabulary_id": null | 384 | "vocabulary_id": null | ||
385 | } | 385 | } | ||
386 | ], | 386 | ], | ||
387 | "title": "Experimental data for the paper \"finding optimal diverse | 387 | "title": "Experimental data for the paper \"finding optimal diverse | ||
388 | feature sets with alternative feature selection\"", | 388 | feature sets with alternative feature selection\"", | ||
389 | "type": "vdataset", | 389 | "type": "vdataset", | ||
390 | "url": "https://doi.org/10.35097/1623" | 390 | "url": "https://doi.org/10.35097/1623" | ||
391 | } | 391 | } |