add validations

Piotr Plonski · Piotr Plonski · commit e24ed0c11e09 · 2017-03-20T15:03:45.000+01:00
diff --git a/mljar/client/experiment.py b/mljar/client/experiment.py
@@ -1,4 +1,5 @@
 import json
+import warnings
 from base import MljarHttpClient
 from ..model.experiment import Experiment
 from ..exceptions import NotFoundException, MljarException, CreateExperimentException
@@ -8,7 +9,7 @@
 from ..log import logger
 
 from ..utils import make_hash
-from ..utils import MLJAR_VALIDATIONS, MLJAR_METRICS, MLJAR_TUNING_MODES, MLJAR_DEFAULT_ALGORITHMS, MLJAR_DEFAULT_METRICS
+from ..utils import MLJAR_METRICS, MLJAR_TUNING_MODES, MLJAR_DEFAULT_ALGORITHMS, MLJAR_DEFAULT_METRICS
 
 class ExperimentClient(MljarHttpClient):
     '''
@@ -45,13 +46,32 @@ def create_experiment(self, data):
             raise CreateExperimentException()
         return Experiment.from_dict(response.json())
 
-    def add_experiment_if_not_exists(self, train_dataset, experiment_title, project_task, \
-                                        validation, algorithms, metric, \
+    def add_experiment_if_not_exists(self, train_dataset, vald_dataset, experiment_title, project_task, \
+                                        validation_kfolds, validation_shuffle, \
+                                        validation_stratify, validation_train_split, \
+                                        algorithms, metric, \
                                         tuning_mode, time_constraint, create_ensemble):
         logger.info('Add experiment if not exists')
         # parameters validation
-        if validation is None or validation == '' or validation not in MLJAR_VALIDATIONS:
-            validation = MLJAR_DEFAULT_VALIDATION
+        # Build the validation scheme label; a separate validation dataset takes precedence
+        if vald_dataset is not None:
+            validation = "With dataset"
+        else:
+            # do train/validation split
+            if validation_train_split is not None:
+                percents = int(round(validation_train_split * 100.0))  # round, since int(0.57 * 100.0) gives 56
+                validation = "Split {}/{}".format(percents, 100-percents)
+            else:
+                validation = "{}-fold CV".format(validation_kfolds)
+
+            # shuffle and stratify
+            if validation_shuffle:
+                validation += ", Shuffle"
+            if validation_stratify and project_task == 'bin_class':
+                validation += ", Stratify"
+            if validation_stratify and project_task != 'bin_class':
+                warnings.warn('Cannot use stratify in validation for your project task. Omitting this option in validation.')
+
         if metric is None or metric == '' or metric not in MLJAR_METRICS:
             metric = MLJAR_DEFAULT_METRICS[project_task]
         if tuning_mode is None or tuning_mode == '' or tuning_mode not in MLJAR_TUNING_MODES:
@@ -67,18 +87,22 @@ def add_experiment_if_not_exists(self, train_dataset, experiment_title, project_
             dataset_preproc['convert_categorical'] = 'categorical_to_int'
         # create stub for new experiment
         logger.info('Create new experiment stub')
+        expt_params = {
+                "train_dataset": {"id": train_dataset.hid, 'title': train_dataset.title},
+                "algs":algorithms,
+                "preproc": dataset_preproc,
+                "single_limit":time_constraint,
+                "ensemble":create_ensemble,
+                "random_start_cnt": MLJAR_TUNING_MODES[tuning_mode]['random_start_cnt'],
+                "hill_climbing_cnt": MLJAR_TUNING_MODES[tuning_mode]['hill_climbing_cnt']
+                }
+        if vald_dataset is not None:
+            expt_params['vald_dataset'] = {"id": vald_dataset.hid, 'title': vald_dataset.title}
+
         new_expt = Experiment(hid='', title=experiment_title, models_cnt=0, task=project_task,
                                 description='', metric=metric, validation_scheme=validation,
                                 total_timelog=0, bestalg=[], details={},
-                                params={
-                                        "train_dataset": {"id": train_dataset.hid, 'title': train_dataset.title},
-                                        "algs":algorithms,
-                                        "preproc": dataset_preproc,
-                                        "single_limit":time_constraint,
-                                        "ensemble":create_ensemble,
-                                        "random_start_cnt": MLJAR_TUNING_MODES[tuning_mode]['random_start_cnt'],
-                                        "hill_climbing_cnt": MLJAR_TUNING_MODES[tuning_mode]['hill_climbing_cnt']
-                                        },
+                                params=expt_params,
                                 compute_now=0, computation_started_at=None, created_at=None,
                                 created_by=None, parent_project=self.project_hid)
 
diff --git a/mljar/mljar.py b/mljar/mljar.py
@@ -26,7 +26,10 @@ class Mljar(object):
     def __init__(self, project, experiment,
                         metric = '',
                         algorithms = [],
-                        validation  = MLJAR_DEFAULT_VALIDATION,
+                        validation_kfolds = MLJAR_DEFAULT_FOLDS,
+                        validation_shuffle = MLJAR_DEFAULT_SHUFFLE,
+                        validation_stratify = MLJAR_DEFAULT_STRATIFY,
+                        validation_train_split = MLJAR_DEFAULT_TRAIN_SPLIT,
                         tuning_mode = MLJAR_DEFAULT_TUNING_MODE,
                         create_ensemble  = MLJAR_DEFAULT_ENSEMBLE,
                         single_algorithm_time_limit = MLJAR_DEFAULT_TIME_CONSTRAINT):
@@ -60,12 +63,20 @@ def __init__(self, project, experiment,
                      - rmse which is Root Mean Square Error
                      - mse which is for Mean Square Error
                      - mase which is for Mean Absolute Error
-            validation: The schema of validation that will be used for model search and tuning. There is only available
-                        validation with cross validation. Proper values are:
-                         - 3fold for 3-fold Stratified CV
-                         - 5fold for 5-fold Stratified CV
-                         - 10fold for 10-fold Stratified CV
-                        The default is 5-fold CV.
+            validation_kfolds: The number of folds to use in k-fold cross validation.
+                            It is ignored if validation_train_split is not None
+                            or if a validation dataset is provided.
+                            It must be a number from 2 to 15.
+            validation_shuffle: The boolean which specifies whether to shuffle samples before training.
+                            It is used in k-fold CV and in validation split. Default is True.
+                            It is ignored when validating with a separate dataset.
+            validation_stratify: The boolean which decides whether samples will be
+                            divided into folds with the same class distribution.
+                            In regression tasks this flag is ignored. Default is True.
+            validation_train_split: The ratio used to split the training dataset into train and validation parts.
+                            It specifies what fraction of the input data is used for training.
+                            It must be in the [0.05, 0.95] range. If it is not None, then
+                            the validation_kfolds parameter is ignored.
             single_algorithm_time_limit: The time in minutes that will be spend for training single algorithm.
                         Default value is 5 minutes.
         '''
@@ -89,19 +100,34 @@ def __init__(self, project, experiment,
         # below params are validated later
         self.algorithms = algorithms
         self.metric = metric
-        self.validation = validation
         self.single_algorithm_time_limit = single_algorithm_time_limit
         self.wait_till_all_done = True
         self.selected_algorithm = None
         self.project = None
         self.experiment = None
 
-    def fit(self, X, y, wait_till_all_done = True):
+        self.validation_kfolds = validation_kfolds
+        self.validation_shuffle = validation_shuffle
+        self.validation_stratify = validation_stratify
+        self.validation_train_split = validation_train_split
+
+        if self.validation_kfolds is not None:
+            if self.validation_kfolds < 2 or self.validation_kfolds > 15:
+                raise MljarException('Wrong validation_kfolds parameter value, it should be in [2, 15] range.')
+
+        if self.validation_train_split is not None:
+            if self.validation_train_split < 0.05 or self.validation_train_split > 0.95:
+                raise MljarException('Wrong validation_train_split parameter value, it should be in (0.05, 0.95) range.')
+
+
+    def fit(self, X, y, validation_data = None, wait_till_all_done = True):
         '''
         Fit models with MLJAR engine.
         Args:
             X: The numpy or pandas matrix with training data.
             y: The numpy or pandas vector with target values.
+            validation_data: Tuple (X, y) with validation data. If set to None, then
+                                the k-fold CV or train split validation will be used.
             wait_till_all_done: The flag which decides if fit function will wait
                                 till experiment is done.
         '''
@@ -113,12 +139,12 @@ def fit(self, X, y, wait_till_all_done = True):
             raise IncorrectInputDataException('Sorry, there is a missmatch between X and y matrices shapes')
 
         try:
-            self._start_experiment(X, y)
+            self._start_experiment(X, y, validation_data)
         except Exception as e:
             print 'Ups, %s' % str(e)
 
 
-    def _start_experiment(self, X, y):
+    def _start_experiment(self, X, y, validation_data = None):
 
         # define project task
         self.project_task = 'bin_class' if len(np.unique(y)) == 2 else 'reg'
@@ -130,14 +156,25 @@ def _start_experiment(self, X, y):
         #
         # add a dataset to project
         #
-        logger.info('MLJAR: add dataset')
+        logger.info('MLJAR: add training dataset')
         self.dataset = DatasetClient(self.project.hid).add_dataset_if_not_exists(X, y)
+
+        self.dataset_vald = None
+        if validation_data is not None:
+            if len(validation_data) != 2:  # anything other than an (X, y) pair is rejected
+                raise MljarException('Wrong format of validation data. It should be tuple (X,y)')
+            logger.info('MLJAR: add validation dataset')
+            X_vald, y_vald = validation_data
+            self.dataset_vald = DatasetClient(self.project.hid).add_dataset_if_not_exists(X_vald, y_vald)
         #
         # add experiment to project
         #
         logger.info('MLJAR: add experiment')
-        self.experiment = ExperimentClient(self.project.hid).add_experiment_if_not_exists(self.dataset, self.experiment_title, self.project_task, \
-                                                    self.validation, self.algorithms, self.metric, \
+        self.experiment = ExperimentClient(self.project.hid).add_experiment_if_not_exists(self.dataset, self.dataset_vald, \
+                                                    self.experiment_title, self.project_task, \
+                                                    self.validation_kfolds, self.validation_shuffle, \
+                                                    self.validation_stratify, self.validation_train_split, \
+                                                    self.algorithms, self.metric, \
                                                     self.tuning_mode, self.single_algorithm_time_limit, self.create_ensemble)
         if self.experiment is None:
             raise UndefinedExperimentException()
diff --git a/mljar/utils.py b/mljar/utils.py
@@ -18,12 +18,11 @@
             'mae'    : 'Mean Absolute Error'
             }
 
-MLJAR_VALIDATIONS = {
-            "3fold" : "3-fold Stratified CV",
-            "5fold" : "5-fold Stratified CV",
-            "10fold": "10-fold Stratified CV",
-            "with_validation": "validation dataset"
-            }
+MLJAR_DEFAULT_FOLDS = 5
+MLJAR_DEFAULT_SHUFFLE = True
+MLJAR_DEFAULT_STRATIFY = True
+MLJAR_DEFAULT_TRAIN_SPLIT = None
+
 
 MLJAR_BIN_CLASS = {
             "xgb"   :"Extreme Gradient Boosting",
@@ -63,7 +62,6 @@
             'regression': ['xgbr', 'lgbr']
             }
 
-MLJAR_DEFAULT_VALIDATION      = '5fold'
 MLJAR_DEFAULT_ENSEMBLE        = True
 MLJAR_DEFAULT_TUNING_MODE     = 'Normal'
 MLJAR_DEFAULT_TIME_CONSTRAINT = '5' # minutes
diff --git a/tests/data/test_1_vald.csv b/tests/data/test_1_vald.csv
@@ -0,0 +1,91 @@
+sepal length,sepal width,petal length,petal width,class
+5.1,3.5,1.4,0.2,0
+4.9,3.0,1.4,0.2,0
+4.7,3.2,1.3,0.2,0
+4.6,3.1,1.5,0.2,0
+5.0,3.6,1.4,0.2,0
+5.4,3.9,1.7,0.4,0
+4.6,3.4,1.4,0.3,0
+5.0,3.4,1.5,0.2,0
+4.4,2.9,1.4,0.2,0
+4.9,3.1,1.5,0.1,0
+5.4,3.7,1.5,0.2,0
+4.8,3.4,1.6,0.2,0
+4.8,3.0,1.4,0.1,0
+4.3,3.0,1.1,0.1,0
+5.8,4.0,1.2,0.2,0
+5.7,4.4,1.5,0.4,0
+5.4,3.9,1.3,0.4,0
+5.1,3.5,1.4,0.3,0
+5.7,3.8,1.7,0.3,0
+5.1,3.8,1.5,0.3,0
+5.4,3.4,1.7,0.2,0
+5.1,3.7,1.5,0.4,0
+4.6,3.6,1.0,0.2,0
+5.1,3.3,1.7,0.5,0
+4.8,3.4,1.9,0.2,0
+5.0,3.0,1.6,0.2,0
+5.0,3.4,1.6,0.4,0
+5.2,3.5,1.5,0.2,0
+5.2,3.4,1.4,0.2,0
+4.7,3.2,1.6,0.2,0
+4.8,3.1,1.6,0.2,0
+5.4,3.4,1.5,0.4,0
+5.2,4.1,1.5,0.1,0
+5.5,4.2,1.4,0.2,0
+4.9,3.1,1.5,0.1,0
+5.0,3.2,1.2,0.2,0
+5.5,3.5,1.3,0.2,0
+4.9,3.1,1.5,0.1,0
+4.4,3.0,1.3,0.2,0
+5.1,3.4,1.5,0.2,0
+5.0,3.5,1.3,0.3,0
+4.5,2.3,1.3,0.3,0
+4.4,3.2,1.3,0.2,0
+5.0,3.5,1.6,0.6,0
+5.1,3.8,1.9,0.4,0
+5.7,2.8,4.5,1.3,1
+6.3,3.3,4.7,1.6,1
+4.9,2.4,3.3,1.0,1
+6.6,2.9,4.6,1.3,1
+5.2,2.7,3.9,1.4,1
+5.0,2.0,3.5,1.0,1
+5.9,3.0,4.2,1.5,1
+6.0,2.2,4.0,1.0,1
+6.1,2.9,4.7,1.4,1
+5.6,2.9,3.6,1.3,1
+6.7,3.1,4.4,1.4,1
+5.6,3.0,4.5,1.5,1
+5.8,2.7,4.1,1.0,1
+6.2,2.2,4.5,1.5,1
+5.6,2.5,3.9,1.1,1
+5.9,3.2,4.8,1.8,1
+6.1,2.8,4.0,1.3,1
+6.3,2.5,4.9,1.5,1
+6.1,2.8,4.7,1.2,1
+6.4,2.9,4.3,1.3,1
+6.6,3.0,4.4,1.4,1
+6.8,2.8,4.8,1.4,1
+6.7,3.0,5.0,1.7,1
+6.0,2.9,4.5,1.5,1
+5.7,2.6,3.5,1.0,1
+5.5,2.4,3.8,1.1,1
+5.5,2.4,3.7,1.0,1
+5.8,2.7,3.9,1.2,1
+6.0,2.7,5.1,1.6,1
+5.4,3.0,4.5,1.5,1
+6.0,3.4,4.5,1.6,1
+6.7,3.1,4.7,1.5,1
+6.3,2.3,4.4,1.3,1
+5.6,3.0,4.1,1.3,1
+5.5,2.5,4.0,1.3,1
+5.5,2.6,4.4,1.2,1
+6.1,3.0,4.6,1.4,1
+5.8,2.6,4.0,1.2,1
+5.0,2.3,3.3,1.0,1
+5.6,2.7,4.2,1.3,1
+5.7,3.0,4.2,1.2,1
+5.7,2.9,4.2,1.3,1
+6.2,2.9,4.3,1.3,1
+5.1,2.5,3.0,1.1,1
+5.7,2.8,4.1,1.3,1
diff --git a/tests/experiment_client_test.py b/tests/experiment_client_test.py
diff --git a/tests/run.py b/tests/run.py