openml
diff --git a/data/sql/estimation_procedure.sql
Lines changed: 2 additions & 1 deletion b/data/sql/estimation_procedure.sql
Lines changed: 2 additions & 1 deletion
diff --git a/data/sql/math_function.sql
Lines changed: 74 additions & 74 deletions b/data/sql/math_function.sql
Lines changed: 74 additions & 74 deletions
diff --git a/data/sql/task_type.sql
Lines changed: 5 additions & 4 deletions b/data/sql/task_type.sql
Lines changed: 5 additions & 4 deletions
diff --git a/data/sql/task_type_inout.sql
Lines changed: 4 additions & 1 deletion b/data/sql/task_type_inout.sql
Lines changed: 4 additions & 1 deletion
diff --git a/openml_OS/controllers/Api_splits.php
Lines changed: 22 additions & 11 deletions b/openml_OS/controllers/Api_splits.php
Lines changed: 22 additions & 11 deletions
diff --git a/openml_OS/libraries/ElasticSearch.php
Lines changed: 28 additions & 25 deletions b/openml_OS/libraries/ElasticSearch.php
Lines changed: 28 additions & 25 deletions
diff --git a/openml_OS/models/Algorithm_setup.php
Lines changed: 26 additions & 0 deletions b/openml_OS/models/Algorithm_setup.php
Lines changed: 26 additions & 0 deletions
@@ -26,4 +26,5 @@ INSERT INTO `estimation_procedure` (`id`, `ttid`, `name`, `type`, `repeats`, `fo
 (25, 1, '4-fold Crossvalidation', 'crossvalidation', 1, 4, 'false', NULL, 'true', 'false', '2016-03-15 13:32:10'),
 (26, 1, 'Test on Training Data', 'testontrainingdata', NULL, NULL, 'false', NULL, NULL, 'false', '2019-03-16 11:30:14'),
 (27, 2, 'Test on Training Data', 'testontrainingdata', NULL, NULL, 'false', NULL, NULL, 'false', '2019-03-16 11:30:14'),
-(28, 1, '20% Holdout (Ordered)', 'holdout_ordered', 1, 1, 'false', 20, NULL, 'false', '2019-05-23 12:40:53');
+(28, 1, '20% Holdout (Ordered)', 'holdout_ordered', 1, 1, 'false', 20, NULL, 'false', '2019-05-23 12:40:53'),
+(29, 9, '10-fold Crossvalidation', 'crossvalidation', 1, 10, 'false', NULL, 'true', 'false', '2014-12-31 20:00:00');
@@ -3,7 +3,8 @@ INSERT INTO `task_type` (`ttid`, `name`, `description`, `creator`, `contributors
 (2, 'Supervised Regression', 'Given a dataset with a numeric target and a set of train/test splits, e.g. generated by a cross-validation procedure, train a model and return the predictions of that model.', 'Joaquin Vanschoren, Jan van Rijn, Luis Torgo, Bernd Bischl', 'Bo Gao, Simon Fischer, Venkatesh Umaashankar, Michael Berthold, Bernd Wiswedel ,Patrick Winter', '2013-02-13 00:00:00'),
 (3, 'Learning Curve', 'Given a dataset with a nominal target, various data samples of increasing size are defined. A model is build for each individual data sample; from this a learning curve can be drawn. ', 'Pavel Brazdil, Jan van Rijn, Joaquin Vanschoren', NULL, '2014-01-21 00:00:00'),
 (4, 'Supervised Data Stream Classification', 'Given a dataset with a nominal target, various data samples of increasing size are defined. A model is build for each individual data sample; from this a learning curve can be drawn.', 'Geoffrey Holmes, Bernhard Pfahringer, Jan van Rijn, Joaquin Vanschoren', NULL, '2014-03-01 00:00:00'),
-(5, 'Clustering', 'Given an input dataset, the task is to partition it into various clusters.', '"Mehdi Jamali", "Jan van Rijn", "Nenad Tomasev", "Joaquin Vanschoren"', NULL, '2014-10-24 00:00:00'),
-(6, 'Machine Learning Challenge', 'This is a standard machine learning challenge with a hidden private dataset.\r\nIt offers a labeled training set and an unlabeled test set. \r\n\r\nThe task is to label the unlabeled instances. Only the OpenML server knows the correct labels, and will evaluate the submitted predictions using these hidden labels. The evaluation procedure, measure, and cost function (if any) are provided.', '"Jan van Rijn","Joaquin Vanschoren"', NULL, '2014-11-28 00:00:00'),
-(7, 'Survival Analysis', 'Related to Regression. Given a dataset (typically consisting of patient data) predict a left timestamp (date entering the study), right timestamp (date of leaving the study), or both. ', '"Benrd Bischl","Dominik Kirchhoff","Michel Lang","Jan van Rijn","Joaquin Vanschoren"', NULL, '2014-12-03 00:00:00'),
-(8, 'Subgroup Discovery', 'Subgroup discovery is a data mining technique which extracts interesting rules with respect to a target variable. An important characteristic of this task is the combination of predictive and descriptive induction. An overview related to the task of subgroup discovery is presented. (description by: Herrera et. al., An overview on subgroup discovery: foundations and applications)', '"Jan N. van Rijn", "Arno Knobbe", "Joaquin Vanschoren"', NULL, '2016-06-17 10:59:20');
+(5, 'Clustering', 'Given an input dataset, the task is to partition it into various clusters.', '\"Mehdi Jamali\", \"Jan van Rijn\", \"Nenad Tomasev\", \"Joaquin Vanschoren\"', NULL, '2014-10-24 00:00:00'),
+(6, 'Machine Learning Challenge', 'This is a standard machine learning challenge with a hidden private dataset.\r\nIt offers a labeled training set and an unlabeled test set. \r\n\r\nThe task is to label the unlabeled instances. Only the OpenML server knows the correct labels, and will evaluate the submitted predictions using these hidden labels. The evaluation procedure, measure, and cost function (if any) are provided.', '\"Jan van Rijn\",\"Joaquin Vanschoren\"', NULL, '2014-11-28 00:00:00'),
+(7, 'Survival Analysis', 'Related to Regression. Given a dataset (typically consisting of patient data) predict a left timestamp (date entering the study), right timestamp (date of leaving the study), or both. ', '\"Benrd Bischl\",\"Dominik Kirchhoff\",\"Michel Lang\",\"Jan van Rijn\",\"Joaquin Vanschoren\"', NULL, '2014-12-03 00:00:00'),
+(8, 'Subgroup Discovery', 'Subgroup discovery is a data mining technique which extracts interesting rules with respect to a target variable. An important characteristic of this task is the combination of predictive and descriptive induction. An overview related to the task of subgroup discovery is presented. (description by: Herrera et. al., An overview on subgroup discovery: foundations and applications)', '\"Jan N. van Rijn\", \"Arno Knobbe\", \"Joaquin Vanschoren\"', NULL, '2016-06-17 10:59:20'),
+(9, 'Multitask Regression', '', 'Jan N. van Rijn', NULL, '2019-10-24 23:46:54');
@@ -58,4 +58,7 @@ INSERT INTO `task_type_inout` (`ttid`, `name`, `type`, `io`, `requirement`, `des
 (8, 'source_data', 'Dataset', 'input', 'required', 'The input data for this task', 10, '{\r\n\"data_type\": \"numeric\",\r\n\"select\": \"did\",\r\n\"from\": \"dataset\"\r\n}', '<oml:data_set>\r\n<oml:data_set_id>[INPUT:source_data]</oml:data_set_id>\r\n<oml:target_feature>[INPUT:target_feature]</oml:target_feature>\r\n<oml:target_value>[INPUT:target_value]</oml:target_value>\r\n</oml:data_set>', '{\r\n  \"name\": \"Dataset(s)\",\r\n  \"autocomplete\": \"commaSeparated\",\r\n  \"datasource\": \"expdbDatasetVersion()\",\r\n  \"placeholder\": \"(*) include all datasets\"\r\n}'),
 (8, 'target_feature', 'String', 'input', 'required', 'The name of the dataset feature to be used as the target feature.', 15, '{\r\n\"data_type\": \"string\",\r\n\"select\": \"name\",\r\n\"from\": \"data_feature\",\r\n\"where\": \"did = \\\"[INPUT:source_data]\\\" AND data_type = \\\"nominal\\\"\"\r\n}', NULL, '{\r\n  \"placeholder\": \"Use default target\"\r\n}'),
 (8, 'target_value', 'String', 'input', 'required', 'The value of the target feature to be used as the SD target value.', 15, '{\r\n\"data_type\": \"string\"\r\n}', NULL, '{\r\n  \"placeholder\": \"Use default target value\"\r\n}'),
-(8, 'time_limit', 'Integer', 'input', 'required', 'The time limit for SD search', 30, '{\r\n\"data_type\": \"numeric\"\r\n}', '<oml:time_limit>[INPUT:time_limit]</oml:time_limit>', 'NULL');
+(8, 'time_limit', 'Integer', 'input', 'required', 'The time limit for SD search', 30, '{\r\n\"data_type\": \"numeric\"\r\n}', '<oml:time_limit>[INPUT:time_limit]</oml:time_limit>', 'NULL'),
+(9, 'estimation_procedure', 'Estimation Procedure', 'input', 'required', 'The estimation procedure used to validate the generated models', 20, '{\r\n\"data_type\": \"numeric\",\r\n\"select\": \"id\",\r\n\"from\": \"estimation_procedure\",\r\n\"where\": \"ttid = [TASK:ttid]\"\r\n}', '<oml:estimation_procedure>\r\n<oml:id>[INPUT:estimation_procedure]</oml:id>\r\n<oml:type>[LOOKUP:estimation_procedure.type]</oml:type>\r\n<oml:data_splits_url>[CONSTANT:base_url]/api_splits/get/[TASK:id]/Task_[TASK:id]_splits.arff</oml:data_splits_url>\r\n<oml:parameter name=\"number_repeats\">[LOOKUP:estimation_procedure.repeats]</oml:parameter>\r\n<oml:parameter name=\"number_folds\">[LOOKUP:estimation_procedure.folds]</oml:parameter>\r\n<oml:parameter name=\"number_samples\">[INPUT:number_samples]</oml:parameter>\r\n</oml:estimation_procedure>', '{\r\n  \"type\": \"select\",\r\n  \"table\": \"estimation_procedure\",\r\n  \"key\": \"id\",\r\n  \"value\": \"name\"\r\n}'),
+(9, 'source_data_list', 'Dataset', 'input', 'required', 'The input data for this task', 10, '{\r\n\"data_type\": \"json\"\r\n}', '<oml:data_set_list>\r\n<oml:data_set_id>[INPUT:source_data_list]</oml:data_set_id>\r\n<oml:target_feature>[INPUT:target_feature]</oml:target_feature>\r\n</oml:data_set_list>', '{\r\n  \"name\": \"Dataset(s)\",\r\n  \"autocomplete\": \"commaSeparated\",\r\n  \"datasource\": \"expdbDatasetVersion()\",\r\n  \"placeholder\": \"(*) include all datasets\"\r\n}'),
+(9, 'target_feature', 'String', 'input', 'required', 'The name of the dataset feature to be used as the target feature.', 15, '{\r\n\"data_type\": \"string\"\r\n}', NULL, '{\r\n  \"default\": \"class\",\r\n  \"placeholder\": \"Use default target\"\r\n}');
@@ -17,7 +17,7 @@ function __construct() {
     $this->load->helper('file_upload');
 
     $this->db = $this->load->database('read',true);
-    $this->task_types = array(1, 2, 3, 6, 7);
+    $this->task_types = array(1, 2, 3, 6, 7, 9);
     $this->challenge_types = array(9);
     $this->evaluation = APPPATH . 'third_party/OpenML/Java/evaluate.jar';
     $this->eval_engine_config = " -config 'cache_allowed=false;server=".BASE_URL.";api_key=".API_KEY."' ";
@@ -105,21 +105,35 @@ function challenge($task_id, $testtrain, $offset_arg, $size_arg) {
     }
   }
 
+  /**
+   * Streams the merged dataset ARFF file of a task to the client as plain text.
+   *
+   * The file lives at <directory>/<bucket>/<task_id>/merged_dataset.arff, where
+   * the bucket is the task id rounded down to a multiple of
+   * content_folder_modulo. When the file is not on disk yet, generate() is
+   * invoked with the "merge_datasets" action to produce it first.
+   *
+   * @param mixed $task_id id of the task whose merged dataset is requested
+   */
+  function merge_datasets($task_id) {
+    $modulo = $this->content_folder_modulo;
+    $bucket = floor($task_id / $modulo) * $modulo;
+    $arff_path = $this->directory . '/' . $bucket . '/' . $task_id . '/merged_dataset.arff';
+
+    // create the file on demand; generate() terminates the request itself on failure
+    if (!file_exists($arff_path)) {
+      $this->generate("merge_datasets", $task_id, $arff_path);
+    }
+
+    header('Content-type: text/plain');
+    header('Content-Length: ' . filesize($arff_path));
+    readfile_chunked($arff_path);
+  }
+  
+  /**
+   * Streams the data splits ARFF file of a task to the client as plain text.
+   *
+   * The file is expected at <directory>/<bucket>/<task_id>/splits.arff, where
+   * the bucket is the task id rounded down to a multiple of
+   * content_folder_modulo. A missing file is produced on demand via generate().
+   *
+   * @param mixed $task_id id of the task whose splits are requested
+   */
   function get($task_id) {
     $dir_idx = floor($task_id / $this->content_folder_modulo) * $this->content_folder_modulo;
     $directory = $this->directory . '/' . $dir_idx . '/' . $task_id;
 
     $filepath = $directory . '/splits.arff';
+    // only invoke the generator when the splits file is not on disk yet
     if (file_exists($filepath) == false) {
-      $this->generate($task_id, $filepath);
+      $this->generate("generate_folds", $task_id, $filepath);
     }
 
+    // announce the size up front, then send the file via readfile_chunked()
     header('Content-type: text/plain');
     header('Content-Length: ' . filesize($filepath));
     readfile_chunked($filepath);
   }
 
-  private function generate($task_id, $filepath) {
+  private function generate($function, $task_id, $filepath) {
     $task = $this->Task->getById($task_id);
     if ($task === false || in_array($task->ttid, $this->task_types) === false) {
       http_response_code($this->config->item('general_http_error_code'));
@@ -130,7 +144,7 @@ private function generate($task_id, $filepath) {
     // TODO: very important. sanity check input
     $testset_str = array_key_exists('custom_testset', $values) && is_cs_natural_numbers($values['custom_testset']) ? '-test "' . $values['custom_testset'] . '"' : '';
 
-    $command = 'java -jar ' . $this->evaluation . ' -f "generate_folds" -id ' . $task_id . ' ' . $this->eval_engine_config;
+    $command = 'java -jar ' . $this->evaluation . ' -f "' . $function . '" -id ' . $task_id . ' ' . $this->eval_engine_config;
 
     if (array_key_exists('custom_testset', $values)) {
       $command .= '-test "' . $values['custom_testset'] . '" ';
@@ -142,9 +156,6 @@ private function generate($task_id, $filepath) {
 
     $command .= ' -o ' . $filepath;
 
-    //if( $md5 ) $command .= ' -m';
-    $this->Log->cmd('API Splits::get(' . $task_id . ')', $command);
-    
     if (function_enabled('exec')) {
       header('Content-type: text/plain');
       $result_status = 0;
@@ -155,20 +166,20 @@ private function generate($task_id, $filepath) {
 
       if ($return_status != 0 && defined('EMAIL_API_LOG')) {
         $to      = EMAIL_API_LOG;
-        $subject = 'OpenML API Split Generation Exception: ' . $result_status;
+        $subject = 'OpenML API [' . $function . '] Exception: ' . $result_status;
         $content = 'Time: ' . now() . "\nTask_id:" . $task_id . "\nOutput: " . implode("\n", $result);
         sendEmail($to, $subject, $content, 'text');
         http_response_code($this->config->item('general_http_error_code'));
-        die('failed to generate arff file. Evaluation Engine result send to EMAIL_API_LOG account.');
+        die('failed to perform action ' . $function . '. Evaluation Engine result send to EMAIL_API_LOG account.');
       }
 
       if ($return_status != 0) {
         http_response_code($this->config->item('general_http_error_code'));
-        die('failed to generate arff file. Evaluation Engine result omitted (no EMAIL_API_LOG defined). ');
+        die('failed to perform action ' . $function . '. Evaluation Engine result omitted (no EMAIL_API_LOG defined). ');
       }
     } else {
       http_response_code($this->config->item('general_http_error_code'));
-      die('failed to generate arff file: php "exec" function disabled. ');
+      die('failed to perform action ' . $function . ': php "exec" function disabled. ');
     }
   }
 }
 
@@ -761,7 +761,7 @@ private function build_study($d) {
             'uploader_id' => $d->creator,
             'uploader' => array_key_exists($d->creator, $this->user_names) ? $this->user_names[$d->creator] : 'Unknown',
             'visibility' => $d->visibility,
-            'type' => $d->main_entity_type,
+            'study_type' => $d->main_entity_type,
             'legacy' => $d->legacy,
             'suggest' => array(
                 'input' => array($d->name, $d->description . ' '),
@@ -890,15 +890,15 @@ private function build_task($d) {
         $did = 0;
         if ($task) {
             foreach ($task as $t) {
-                if ($t->type == 'Dataset') {
+                if ($t->input == 'source_data') {
                     $description[] = $this->data_names[$t->value];
                     $newdata[$t->input] = array(
                         'type' => $t->type,
                         'data_id' => $t->value,
                         'name' => $this->data_names[$t->value]
                     );
                     $did = $t->value;
-                } else if ($t->type == 'Estimation Procedure') {
+                } else if ($t->input == 'estimation_procedure') {
                     $description[] = $this->procedure_names[$t->value];
                     $newdata[$t->input] = array(
                         'type' => $t->type,
@@ -1064,10 +1064,12 @@ private function fetch_classes($id = false) {
 
+    /**
+     * Collects download information for the runfile rows whose `source`
+     * lies in the half-open range [$min, $max).
+     *
+     * NOTE(review): $min and $max are concatenated into the SQL text as-is;
+     * presumably callers only pass integers - confirm.
+     *
+     * @param mixed $min inclusive lower bound on runfile.source
+     * @param mixed $max exclusive upper bound on runfile.source
+     * @return array $index[source][field] = array('url' => ..., 'format' => ...);
+     *               empty when the query result is falsy
+     */
     private function fetch_runfiles($min, $max) {
         $index = array();
-        foreach ($this->db->query('SELECT source, field, name, format, file_id from runfile where source >= ' . $min . ' and source < ' . $max) as $r) {
-            $index[$r->source][$r->field]['url'] = BASE_URL . 'data/download/' . $r->file_id . '/' . $r->name;
-            $index[$r->source][$r->field]['format'] = $r->format;
-        }
         $runfiles = $this->db->query('SELECT source, field, name, format, file_id from runfile where source >= ' . $min . ' and source < ' . $max);
+        // a falsy query result is skipped instead of being handed to foreach
         if ($runfiles)
           foreach ($runfiles as $r) {
               $index[$r->source][$r->field]['url'] = BASE_URL . 'data/download/' . $r->file_id . '/' . $r->name;
               $index[$r->source][$r->field]['format'] = $r->format;
           }
         return $index;
     }
 
@@ -1439,16 +1441,16 @@ private function build_task_type($d) {
         );
 
         $inputs = $this->db->query('SELECT name, type, description, io, requirement FROM task_type_inout where ttid=' . $d->ttid);
-
-        foreach ($inputs as $i) {
-            $new_data['input'][] = array(
-                'name' => $i->name,
-                'type' => $i->type,
-                'description' => $i->description,
-                'io' => $i->io,
-                'requirement' => $i->requirement
-            );
-        }
+        if ($inputs)
+          foreach ($inputs as $i) {
+              $new_data['input'][] = array(
+                  'name' => $i->name,
+                  'type' => $i->type,
+                  'description' => $i->description,
+                  'io' => $i->io,
+                  'requirement' => $i->requirement
+              );
+          }
         return $new_data;
     }
 
@@ -1767,15 +1769,16 @@ public function index_single_dataset($id) {
         if ($id and ! $datasets)
             return 'Error: data set ' . $id . ' is unknown';
 
-        foreach ($datasets as $d) {
-            $params['body'][] = array(
-                'index' => array(
-                    '_id' => $d->did
-                )
-            );
+        if ($datasets)
+          foreach ($datasets as $d) {
+              $params['body'][] = array(
+                  'index' => array(
+                      '_id' => $d->did
+                  )
+              );
 
-            $params['body'][] = $this->build_data($d);
-        }
+              $params['body'][] = $this->build_data($d);
+          }
 
         $responses = $this->client->bulk($params);
 
 
@@ -138,6 +138,32 @@ function createSetup($implementation, $parameters, $setup_string) {
     }
     return $setupId;
   }
+  
+  /**
+   * Fetches the parameter settings of a collection of setups, grouped per setup.
+   *
+   * Each returned row combines the columns of `input` and `input_setting`
+   * with the owning flow's name (`flow_name`) and full name (`flow_fullName`).
+   *
+   * @param array $setups setup ids, matched against algorithm_setup.sid
+   * @return array map of setup id => list of parameter rows; a setup without
+   *               any parameter maps to an empty list, and an empty $setups
+   *               yields an empty map
+   */
+  public function setup_ids_to_parameter_values($setups) {
+    // where_in() with an empty list renders "IN()", which is invalid SQL;
+    // there is nothing to look up in that case anyway
+    if (empty($setups)) {
+      return array();
+    }
+
+    // the inner joins return no rows for setups without parameters;
+    // those setups still get an (empty) entry through the initialization below
+    $this->db->select('input.*, input_setting.*, `implementation`.`name` AS `flow_name`, `implementation`.`fullName` AS `flow_fullName`')->from('input_setting');
+    $this->db->join('input', 'input_setting.input_id = input.id', 'inner');
+    $this->db->join('implementation', 'input.implementation_id = implementation.id', 'inner');
+    // note that algorithm setup can not be linked to implementation id, otherwise we will only get parameters of the root classifier
+    $this->db->join('algorithm_setup', 'algorithm_setup.sid = input_setting.setup', 'inner');
+    // NOTE(review): no setup_tag column is selected or filtered on here; a setup
+    // carrying several tags presumably yields duplicated parameter rows - confirm
+    // whether this join is still needed
+    $this->db->join('setup_tag', 'input_setting.setup = setup_tag.id', 'left');
+    $this->db->where_in('algorithm_setup.sid', $setups);
+
+    $query = $this->db->get();
+    $parameters = $query->result();
+
+    $per_setup = array();
+    // initialize the array, so every requested setup is present in the result
+    foreach ($setups as $setup) {
+      $per_setup[$setup] = array();
+    }
+    // now fill with parameters
+    foreach ($parameters as $parameter) {
+      $per_setup[$parameter->setup][] = $parameter;
+    }
+
+    return $per_setup;
+  }
 }
 
 ?>