Skip to content

Commit b079c37

Browse files
Merge pull request #1002 from openml/develop
Develop
2 parents 1120db0 + 50d9b0f commit b079c37

21 files changed

Lines changed: 397 additions & 238 deletions

data/sql/estimation_procedure.sql

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -26,4 +26,5 @@ INSERT INTO `estimation_procedure` (`id`, `ttid`, `name`, `type`, `repeats`, `fo
2626
(25, 1, '4-fold Crossvalidation', 'crossvalidation', 1, 4, 'false', NULL, 'true', 'false', '2016-03-15 13:32:10'),
2727
(26, 1, 'Test on Training Data', 'testontrainingdata', NULL, NULL, 'false', NULL, NULL, 'false', '2019-03-16 11:30:14'),
2828
(27, 2, 'Test on Training Data', 'testontrainingdata', NULL, NULL, 'false', NULL, NULL, 'false', '2019-03-16 11:30:14'),
29-
(28, 1, '20% Holdout (Ordered)', 'holdout_ordered', 1, 1, 'false', 20, NULL, 'false', '2019-05-23 12:40:53');
29+
(28, 1, '20% Holdout (Ordered)', 'holdout_ordered', 1, 1, 'false', 20, NULL, 'false', '2019-05-23 12:40:53'),
30+
(29, 9, '10-fold Crossvalidation', 'crossvalidation', 1, 10, 'false', NULL, 'true', 'false', '2014-12-31 20:00:00');

data/sql/math_function.sql

Lines changed: 74 additions & 74 deletions
Large diffs are not rendered by default.

data/sql/task_type.sql

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,8 @@ INSERT INTO `task_type` (`ttid`, `name`, `description`, `creator`, `contributors
33
(2, 'Supervised Regression', 'Given a dataset with a numeric target and a set of train/test splits, e.g. generated by a cross-validation procedure, train a model and return the predictions of that model.', 'Joaquin Vanschoren, Jan van Rijn, Luis Torgo, Bernd Bischl', 'Bo Gao, Simon Fischer, Venkatesh Umaashankar, Michael Berthold, Bernd Wiswedel ,Patrick Winter', '2013-02-13 00:00:00'),
44
(3, 'Learning Curve', 'Given a dataset with a nominal target, various data samples of increasing size are defined. A model is build for each individual data sample; from this a learning curve can be drawn. ', 'Pavel Brazdil, Jan van Rijn, Joaquin Vanschoren', NULL, '2014-01-21 00:00:00'),
55
(4, 'Supervised Data Stream Classification', 'Given a dataset with a nominal target, various data samples of increasing size are defined. A model is build for each individual data sample; from this a learning curve can be drawn.', 'Geoffrey Holmes, Bernhard Pfahringer, Jan van Rijn, Joaquin Vanschoren', NULL, '2014-03-01 00:00:00'),
6-
(5, 'Clustering', 'Given an input dataset, the task is to partition it into various clusters.', '"Mehdi Jamali", "Jan van Rijn", "Nenad Tomasev", "Joaquin Vanschoren"', NULL, '2014-10-24 00:00:00'),
7-
(6, 'Machine Learning Challenge', 'This is a standard machine learning challenge with a hidden private dataset.\r\nIt offers a labeled training set and an unlabeled test set. \r\n\r\nThe task is to label the unlabeled instances. Only the OpenML server knows the correct labels, and will evaluate the submitted predictions using these hidden labels. The evaluation procedure, measure, and cost function (if any) are provided.', '"Jan van Rijn","Joaquin Vanschoren"', NULL, '2014-11-28 00:00:00'),
8-
(7, 'Survival Analysis', 'Related to Regression. Given a dataset (typically consisting of patient data) predict a left timestamp (date entering the study), right timestamp (date of leaving the study), or both. ', '"Benrd Bischl","Dominik Kirchhoff","Michel Lang","Jan van Rijn","Joaquin Vanschoren"', NULL, '2014-12-03 00:00:00'),
9-
(8, 'Subgroup Discovery', 'Subgroup discovery is a data mining technique which extracts interesting rules with respect to a target variable. An important characteristic of this task is the combination of predictive and descriptive induction. An overview related to the task of subgroup discovery is presented. (description by: Herrera et. al., An overview on subgroup discovery: foundations and applications)', '"Jan N. van Rijn", "Arno Knobbe", "Joaquin Vanschoren"', NULL, '2016-06-17 10:59:20');
6+
(5, 'Clustering', 'Given an input dataset, the task is to partition it into various clusters.', '\"Mehdi Jamali\", \"Jan van Rijn\", \"Nenad Tomasev\", \"Joaquin Vanschoren\"', NULL, '2014-10-24 00:00:00'),
7+
(6, 'Machine Learning Challenge', 'This is a standard machine learning challenge with a hidden private dataset.\r\nIt offers a labeled training set and an unlabeled test set. \r\n\r\nThe task is to label the unlabeled instances. Only the OpenML server knows the correct labels, and will evaluate the submitted predictions using these hidden labels. The evaluation procedure, measure, and cost function (if any) are provided.', '\"Jan van Rijn\",\"Joaquin Vanschoren\"', NULL, '2014-11-28 00:00:00'),
8+
(7, 'Survival Analysis', 'Related to Regression. Given a dataset (typically consisting of patient data) predict a left timestamp (date entering the study), right timestamp (date of leaving the study), or both. ', '\"Benrd Bischl\",\"Dominik Kirchhoff\",\"Michel Lang\",\"Jan van Rijn\",\"Joaquin Vanschoren\"', NULL, '2014-12-03 00:00:00'),
9+
(8, 'Subgroup Discovery', 'Subgroup discovery is a data mining technique which extracts interesting rules with respect to a target variable. An important characteristic of this task is the combination of predictive and descriptive induction. An overview related to the task of subgroup discovery is presented. (description by: Herrera et. al., An overview on subgroup discovery: foundations and applications)', '\"Jan N. van Rijn\", \"Arno Knobbe\", \"Joaquin Vanschoren\"', NULL, '2016-06-17 10:59:20'),
10+
(9, 'Multitask Regression', '', 'Jan N. van Rijn', NULL, '2019-10-24 23:46:54');

data/sql/task_type_inout.sql

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -58,4 +58,7 @@ INSERT INTO `task_type_inout` (`ttid`, `name`, `type`, `io`, `requirement`, `des
5858
(8, 'source_data', 'Dataset', 'input', 'required', 'The input data for this task', 10, '{\r\n\"data_type\": \"numeric\",\r\n\"select\": \"did\",\r\n\"from\": \"dataset\"\r\n}', '<oml:data_set>\r\n<oml:data_set_id>[INPUT:source_data]</oml:data_set_id>\r\n<oml:target_feature>[INPUT:target_feature]</oml:target_feature>\r\n<oml:target_value>[INPUT:target_value]</oml:target_value>\r\n</oml:data_set>', '{\r\n \"name\": \"Dataset(s)\",\r\n \"autocomplete\": \"commaSeparated\",\r\n \"datasource\": \"expdbDatasetVersion()\",\r\n \"placeholder\": \"(*) include all datasets\"\r\n}'),
5959
(8, 'target_feature', 'String', 'input', 'required', 'The name of the dataset feature to be used as the target feature.', 15, '{\r\n\"data_type\": \"string\",\r\n\"select\": \"name\",\r\n\"from\": \"data_feature\",\r\n\"where\": \"did = \\\"[INPUT:source_data]\\\" AND data_type = \\\"nominal\\\"\"\r\n}', NULL, '{\r\n \"placeholder\": \"Use default target\"\r\n}'),
6060
(8, 'target_value', 'String', 'input', 'required', 'The value of the target feature to be used as the SD target value.', 15, '{\r\n\"data_type\": \"string\"\r\n}', NULL, '{\r\n \"placeholder\": \"Use default target value\"\r\n}'),
61-
(8, 'time_limit', 'Integer', 'input', 'required', 'The time limit for SD search', 30, '{\r\n\"data_type\": \"numeric\"\r\n}', '<oml:time_limit>[INPUT:time_limit]</oml:time_limit>', 'NULL');
61+
(8, 'time_limit', 'Integer', 'input', 'required', 'The time limit for SD search', 30, '{\r\n\"data_type\": \"numeric\"\r\n}', '<oml:time_limit>[INPUT:time_limit]</oml:time_limit>', 'NULL'),
62+
(9, 'estimation_procedure', 'Estimation Procedure', 'input', 'required', 'The estimation procedure used to validate the generated models', 20, '{\r\n\"data_type\": \"numeric\",\r\n\"select\": \"id\",\r\n\"from\": \"estimation_procedure\",\r\n\"where\": \"ttid = [TASK:ttid]\"\r\n}', '<oml:estimation_procedure>\r\n<oml:id>[INPUT:estimation_procedure]</oml:id>\r\n<oml:type>[LOOKUP:estimation_procedure.type]</oml:type>\r\n<oml:data_splits_url>[CONSTANT:base_url]/api_splits/get/[TASK:id]/Task_[TASK:id]_splits.arff</oml:data_splits_url>\r\n<oml:parameter name=\"number_repeats\">[LOOKUP:estimation_procedure.repeats]</oml:parameter>\r\n<oml:parameter name=\"number_folds\">[LOOKUP:estimation_procedure.folds]</oml:parameter>\r\n<oml:parameter name=\"number_samples\">[INPUT:number_samples]</oml:parameter>\r\n</oml:estimation_procedure>', '{\r\n \"type\": \"select\",\r\n \"table\": \"estimation_procedure\",\r\n \"key\": \"id\",\r\n \"value\": \"name\"\r\n}'),
63+
(9, 'source_data_list', 'Dataset', 'input', 'required', 'The input data for this task', 10, '{\r\n\"data_type\": \"json\"\r\n}', '<oml:data_set_list>\r\n<oml:data_set_id>[INPUT:source_data_list]</oml:data_set_id>\r\n<oml:target_feature>[INPUT:target_feature]</oml:target_feature>\r\n</oml:data_set_list>', '{\r\n \"name\": \"Dataset(s)\",\r\n \"autocomplete\": \"commaSeparated\",\r\n \"datasource\": \"expdbDatasetVersion()\",\r\n \"placeholder\": \"(*) include all datasets\"\r\n}'),
64+
(9, 'target_feature', 'String', 'input', 'required', 'The name of the dataset feature to be used as the target feature.', 15, '{\r\n\"data_type\": \"string\"\r\n}', NULL, '{\r\n \"default\": \"class\",\r\n \"placeholder\": \"Use default target\"\r\n}');

openml_OS/controllers/Api_splits.php

Lines changed: 22 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@ function __construct() {
1717
$this->load->helper('file_upload');
1818

1919
$this->db = $this->load->database('read',true);
20-
$this->task_types = array(1, 2, 3, 6, 7);
20+
$this->task_types = array(1, 2, 3, 6, 7, 9);
2121
$this->challenge_types = array(9);
2222
$this->evaluation = APPPATH . 'third_party/OpenML/Java/evaluate.jar';
2323
$this->eval_engine_config = " -config 'cache_allowed=false;server=".BASE_URL.";api_key=".API_KEY."' ";
@@ -105,21 +105,35 @@ function challenge($task_id, $testtrain, $offset_arg, $size_arg) {
105105
}
106106
}
107107

108+
/**
 * Streams the merged-dataset ARFF file of a task to the client as plain text.
 *
 * The file is looked up under this controller's content directory; when it is
 * not on disk yet, generate() is invoked with the "merge_datasets" function
 * name to produce it before it is served.
 *
 * @param mixed $task_id id of the task (used in arithmetic and in the path;
 *                       presumably a numeric id taken from the URL - confirm)
 */
function merge_datasets($task_id) {
109+
// Bucket index: task_id rounded down to a multiple of content_folder_modulo.
$dir_idx = floor($task_id / $this->content_folder_modulo) * $this->content_folder_modulo;
110+
$directory = $this->directory . '/' . $dir_idx . '/' . $task_id;
111+
112+
$filepath = $directory . '/merged_dataset.arff';
113+
// Only generate when the file does not exist yet; an existing file is reused as-is.
if (file_exists($filepath) == false) {
114+
$this->generate("merge_datasets", $task_id, $filepath);
115+
}
116+
117+
// Send the file size up front, then stream the file itself.
header('Content-type: text/plain');
118+
header('Content-Length: ' . filesize($filepath));
119+
readfile_chunked($filepath);
120+
}
121+
108122
/**
 * Streams the splits ARFF file of a task to the client as plain text.
 *
 * The file is looked up under this controller's content directory; when it is
 * not on disk yet, generate() is invoked to produce it before it is served.
 *
 * @param mixed $task_id id of the task (used in arithmetic and in the path;
 *                       presumably a numeric id taken from the URL - confirm)
 */
function get($task_id) {
109123
// Bucket index: task_id rounded down to a multiple of content_folder_modulo.
$dir_idx = floor($task_id / $this->content_folder_modulo) * $this->content_folder_modulo;
110124
$directory = $this->directory . '/' . $dir_idx . '/' . $task_id;
111125

112126
$filepath = $directory . '/splits.arff';
113127
// Only generate when the file does not exist yet; an existing file is reused as-is.
if (file_exists($filepath) == false) {
114-
$this->generate($task_id, $filepath);
128+
// The generator function name is passed explicitly as the first argument.
$this->generate("generate_folds", $task_id, $filepath);
115129
}
116130

117131
// Send the file size up front, then stream the file itself.
header('Content-type: text/plain');
118132
header('Content-Length: ' . filesize($filepath));
119133
readfile_chunked($filepath);
120134
}
121135

122-
private function generate($task_id, $filepath) {
136+
private function generate($function, $task_id, $filepath) {
123137
$task = $this->Task->getById($task_id);
124138
if ($task === false || in_array($task->ttid, $this->task_types) === false) {
125139
http_response_code($this->config->item('general_http_error_code'));
@@ -130,7 +144,7 @@ private function generate($task_id, $filepath) {
130144
// TODO: very important. sanity check input
131145
$testset_str = array_key_exists('custom_testset', $values) && is_cs_natural_numbers($values['custom_testset']) ? '-test "' . $values['custom_testset'] . '"' : '';
132146

133-
$command = 'java -jar ' . $this->evaluation . ' -f "generate_folds" -id ' . $task_id . ' ' . $this->eval_engine_config;
147+
$command = 'java -jar ' . $this->evaluation . ' -f "' . $function . '" -id ' . $task_id . ' ' . $this->eval_engine_config;
134148

135149
if (array_key_exists('custom_testset', $values)) {
136150
$command .= '-test "' . $values['custom_testset'] . '" ';
@@ -142,9 +156,6 @@ private function generate($task_id, $filepath) {
142156

143157
$command .= ' -o ' . $filepath;
144158

145-
//if( $md5 ) $command .= ' -m';
146-
$this->Log->cmd('API Splits::get(' . $task_id . ')', $command);
147-
148159
if (function_enabled('exec')) {
149160
header('Content-type: text/plain');
150161
$result_status = 0;
@@ -155,20 +166,20 @@ private function generate($task_id, $filepath) {
155166

156167
if ($return_status != 0 && defined('EMAIL_API_LOG')) {
157168
$to = EMAIL_API_LOG;
158-
$subject = 'OpenML API Split Generation Exception: ' . $result_status;
169+
$subject = 'OpenML API [' . $function . '] Exception: ' . $result_status;
159170
$content = 'Time: ' . now() . "\nTask_id:" . $task_id . "\nOutput: " . implode("\n", $result);
160171
sendEmail($to, $subject, $content, 'text');
161172
http_response_code($this->config->item('general_http_error_code'));
162-
die('failed to generate arff file. Evaluation Engine result send to EMAIL_API_LOG account.');
173+
die('failed to perform action ' . $function . '. Evaluation Engine result send to EMAIL_API_LOG account.');
163174
}
164175

165176
if ($return_status != 0) {
166177
http_response_code($this->config->item('general_http_error_code'));
167-
die('failed to generate arff file. Evaluation Engine result omitted (no EMAIL_API_LOG defined). ');
178+
die('failed to perform action ' . $function . '. Evaluation Engine result omitted (no EMAIL_API_LOG defined). ');
168179
}
169180
} else {
170181
http_response_code($this->config->item('general_http_error_code'));
171-
die('failed to generate arff file: php "exec" function disabled. ');
182+
die('failed to perform action ' . $function . ': php "exec" function disabled. ');
172183
}
173184
}
174185
}

openml_OS/libraries/ElasticSearch.php

Lines changed: 28 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -761,7 +761,7 @@ private function build_study($d) {
761761
'uploader_id' => $d->creator,
762762
'uploader' => array_key_exists($d->creator, $this->user_names) ? $this->user_names[$d->creator] : 'Unknown',
763763
'visibility' => $d->visibility,
764-
'type' => $d->main_entity_type,
764+
'study_type' => $d->main_entity_type,
765765
'legacy' => $d->legacy,
766766
'suggest' => array(
767767
'input' => array($d->name, $d->description . ' '),
@@ -890,15 +890,15 @@ private function build_task($d) {
890890
$did = 0;
891891
if ($task) {
892892
foreach ($task as $t) {
893-
if ($t->type == 'Dataset') {
893+
if ($t->input == 'source_data') {
894894
$description[] = $this->data_names[$t->value];
895895
$newdata[$t->input] = array(
896896
'type' => $t->type,
897897
'data_id' => $t->value,
898898
'name' => $this->data_names[$t->value]
899899
);
900900
$did = $t->value;
901-
} else if ($t->type == 'Estimation Procedure') {
901+
} else if ($t->input == 'estimation_procedure') {
902902
$description[] = $this->procedure_names[$t->value];
903903
$newdata[$t->input] = array(
904904
'type' => $t->type,
@@ -1064,10 +1064,12 @@ private function fetch_classes($id = false) {
10641064

10651065
private function fetch_runfiles($min, $max) {
10661066
$index = array();
1067-
foreach ($this->db->query('SELECT source, field, name, format, file_id from runfile where source >= ' . $min . ' and source < ' . $max) as $r) {
1068-
$index[$r->source][$r->field]['url'] = BASE_URL . 'data/download/' . $r->file_id . '/' . $r->name;
1069-
$index[$r->source][$r->field]['format'] = $r->format;
1070-
}
1067+
$runfiles = $this->db->query('SELECT source, field, name, format, file_id from runfile where source >= ' . $min . ' and source < ' . $max);
1068+
if ($runfiles)
1069+
foreach ($runfiles as $r) {
1070+
$index[$r->source][$r->field]['url'] = BASE_URL . 'data/download/' . $r->file_id . '/' . $r->name;
1071+
$index[$r->source][$r->field]['format'] = $r->format;
1072+
}
10711073
return $index;
10721074
}
10731075

@@ -1439,16 +1441,16 @@ private function build_task_type($d) {
14391441
);
14401442

14411443
$inputs = $this->db->query('SELECT name, type, description, io, requirement FROM task_type_inout where ttid=' . $d->ttid);
1442-
1443-
foreach ($inputs as $i) {
1444-
$new_data['input'][] = array(
1445-
'name' => $i->name,
1446-
'type' => $i->type,
1447-
'description' => $i->description,
1448-
'io' => $i->io,
1449-
'requirement' => $i->requirement
1450-
);
1451-
}
1444+
if ($inputs)
1445+
foreach ($inputs as $i) {
1446+
$new_data['input'][] = array(
1447+
'name' => $i->name,
1448+
'type' => $i->type,
1449+
'description' => $i->description,
1450+
'io' => $i->io,
1451+
'requirement' => $i->requirement
1452+
);
1453+
}
14521454
return $new_data;
14531455
}
14541456

@@ -1767,15 +1769,16 @@ public function index_single_dataset($id) {
17671769
if ($id and ! $datasets)
17681770
return 'Error: data set ' . $id . ' is unknown';
17691771

1770-
foreach ($datasets as $d) {
1771-
$params['body'][] = array(
1772-
'index' => array(
1773-
'_id' => $d->did
1774-
)
1775-
);
1772+
if ($datasets)
1773+
foreach ($datasets as $d) {
1774+
$params['body'][] = array(
1775+
'index' => array(
1776+
'_id' => $d->did
1777+
)
1778+
);
17761779

1777-
$params['body'][] = $this->build_data($d);
1778-
}
1780+
$params['body'][] = $this->build_data($d);
1781+
}
17791782

17801783
$responses = $this->client->bulk($params);
17811784

openml_OS/models/Algorithm_setup.php

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -138,6 +138,32 @@ function createSetup($implementation, $parameters, $setup_string) {
138138
}
139139
return $setupId;
140140
}
141+
142+
/**
 * Collects the parameter settings of the given setups, grouped per setup id.
 *
 * @param array $setups setup ids (matched against algorithm_setup.sid)
 * @return array map of setup id => list of result rows (input and
 *               input_setting columns plus flow_name / flow_fullName);
 *               every requested id is present, with an empty list when the
 *               query returned no rows for it
 */
public function setup_ids_to_parameter_values($setups) {
143+
// The inner joins yield no rows for setups (classifiers) without parameters;
// such setups still get an empty entry from the initialization loop below.
144+
$this->db->select('input.*, input_setting.*, `implementation`.`name` AS `flow_name`, `implementation`.`fullName` AS `flow_fullName`')->from('input_setting');
145+
$this->db->join('input', 'input_setting.input_id = input.id', 'inner');
146+
$this->db->join('implementation', 'input.implementation_id = implementation.id', 'inner');
147+
// note that algorithm setup can not be linked to implementation id, otherwise we will only get parameters of the root classifier
148+
$this->db->join('algorithm_setup', 'algorithm_setup.sid = input_setting.setup', 'inner');
149+
// NOTE(review): no setup_tag column is selected or filtered on here; if a setup
// can have several setup_tag rows this left join would repeat each parameter
// row once per tag - confirm the join is needed.
$this->db->join('setup_tag', 'input_setting.setup = setup_tag.id', 'left');
150+
// NOTE(review): behaviour for an empty $setups depends on how where_in treats
// an empty list - confirm callers never pass one.
$this->db->where_in('algorithm_setup.sid', $setups);
151+
152+
$query = $this->db->get();
153+
$parameters = $query->result();
154+
155+
$per_setup = array();
156+
// initialize the array
157+
foreach ($setups as $setup) {
158+
$per_setup[$setup] = array();
159+
}
160+
// now fill with parameters
161+
foreach ($parameters as $parameter) {
162+
// rows are grouped by their `setup` column (the join key to algorithm_setup.sid)
$per_setup[$parameter->setup][] = $parameter;
163+
}
164+
165+
return $per_setup;
166+
}
141167
}
142168

143169
?>

0 commit comments

Comments
 (0)