@@ -26,7 +26,10 @@ class Mljar(object):
2626 def __init__ (self , project , experiment ,
2727 metric = '' ,
2828 algorithms = [],
29- validation = MLJAR_DEFAULT_VALIDATION ,
29+ validation_kfolds = MLJAR_DEFAULT_FOLDS ,
30+ validation_shuffle = MLJAR_DEFAULT_SHUFFLE ,
31+ validation_stratify = MLJAR_DEFAULT_STRATIFY ,
32+ validation_train_split = MLJAR_DEFAULT_TRAIN_SPLIT ,
3033 tuning_mode = MLJAR_DEFAULT_TUNING_MODE ,
3134 create_ensemble = MLJAR_DEFAULT_ENSEMBLE ,
3235 single_algorithm_time_limit = MLJAR_DEFAULT_TIME_CONSTRAINT ):
@@ -60,12 +63,20 @@ def __init__(self, project, experiment,
6063 - rmse which is Root Mean Square Error
6164 - mse which is for Mean Square Error
6265 - mase which is for Mean Absolute Error
63- validation: The schema of validation that will be used for model search and tuning. There is only available
64- validation with cross validation. Proper values are:
65- - 3fold for 3-fold Stratified CV
66- - 5fold for 5-fold Stratified CV
67- - 10fold for 10-fold Stratified CV
68- The default is 5-fold CV.
66+ validation_kfolds: The number of folds to be used in k-fold validation.
67+ It is ignored if validation_train_split is not None
68+ or if a validation dataset is provided.
69+ It must be a number from 2 to 15.
70+ validation_shuffle: The boolean which specifies whether to shuffle samples before training.
71+ It is used in k-fold CV and in validation split. The default is True.
72+ It is ignored when validating with a separate dataset.
73+ validation_stratify: The boolean which decides whether samples will be
74+ divided into folds with the same class distribution.
75+ In regression tasks this flag is ignored. Default is set to True.
76+ validation_train_split: The ratio used to split the training dataset into train and validation parts.
77+ It specifies what fraction of the input data should be used for training.
78+ It should be between 0.05 and 0.95 (inclusive). If it is not None, then
79+ validation_kfolds variable is ignored.
6980 single_algorithm_time_limit: The time in minutes that will be spend for training single algorithm.
7081 Default value is 5 minutes.
7182 '''
@@ -89,19 +100,34 @@ def __init__(self, project, experiment,
89100 # below params are validated later
90101 self .algorithms = algorithms
91102 self .metric = metric
92- self .validation = validation
93103 self .single_algorithm_time_limit = single_algorithm_time_limit
94104 self .wait_till_all_done = True
95105 self .selected_algorithm = None
96106 self .project = None
97107 self .experiment = None
98108
99- def fit (self , X , y , wait_till_all_done = True ):
109+ self .validation_kfolds = validation_kfolds
110+ self .validation_shuffle = validation_shuffle
111+ self .validation_stratify = validation_stratify
112+ self .validation_train_split = validation_train_split
113+
114+ if self .validation_kfolds is not None :
115+ if self .validation_kfolds < 2 or self .validation_kfolds > 15 :
116+ raise MljarException ('Wrong validation_kfolds parameter value, it should be in [2, 15] range.' )
117+
118+ if self .validation_train_split is not None :
119+ if self .validation_train_split < 0.05 or self .validation_train_split > 0.95 :
120+ raise MljarException ('Wrong validation_train_split parameter value, it should be in (0.05, 0.95) range.' )
121+
122+
123+ def fit (self , X , y , validation_data = None , wait_till_all_done = True ):
100124 '''
101125 Fit models with MLJAR engine.
102126 Args:
103127 X: The numpy or pandas matrix with training data.
104128 y: The numpy or pandas vector with target values.
129+ validation_data: Tuple (X,y) with validation data. If set to None, then
130+ the k-fold CV or train split validation will be used.
105131 wait_till_all_done: The flag which decides if fit function will wait
106132 till experiment is done.
107133 '''
@@ -113,12 +139,12 @@ def fit(self, X, y, wait_till_all_done = True):
113139 raise IncorrectInputDataException ('Sorry, there is a missmatch between X and y matrices shapes' )
114140
115141 try :
116- self ._start_experiment (X , y )
142+ self ._start_experiment (X , y , validation_data )
117143 except Exception as e :
118144 print 'Ups, %s' % str (e )
119145
120146
121- def _start_experiment (self , X , y ):
147+ def _start_experiment (self , X , y , validation_data = None ):
122148
123149 # define project task
124150 self .project_task = 'bin_class' if len (np .unique (y )) == 2 else 'reg'
@@ -130,14 +156,25 @@ def _start_experiment(self, X, y):
130156 #
131157 # add a dataset to project
132158 #
133- logger .info ('MLJAR: add dataset' )
159+ logger .info ('MLJAR: add training dataset' )
134160 self .dataset = DatasetClient (self .project .hid ).add_dataset_if_not_exists (X , y )
161+
162+ self .dataset_vald = None
163+ if validation_data is not None :
164+ if len (validation_data ) != 2 :  # reject anything that is not an (X, y) pair; '== 2' rejected the valid case
165+ raise MljarException ('Wrong format of validation data. It should be tuple (X,y)' )
166+ logger .info ('MLJAR: add validation dataset' )
167+ X_vald , y_vald = validation_data
168+ self .dataset_vald = DatasetClient (self .project .hid ).add_dataset_if_not_exists (X_vald , y_vald )
135169 #
136170 # add experiment to project
137171 #
138172 logger .info ('MLJAR: add experiment' )
139- self .experiment = ExperimentClient (self .project .hid ).add_experiment_if_not_exists (self .dataset , self .experiment_title , self .project_task , \
140- self .validation , self .algorithms , self .metric , \
173+ self .experiment = ExperimentClient (self .project .hid ).add_experiment_if_not_exists (self .dataset , self .dataset_vald , \
174+ self .experiment_title , self .project_task , \
175+ self .validation_kfolds , self .validation_shuffle , \
176+ self .validation_stratify , self .validation_train_split , \
177+ self .algorithms , self .metric , \
141178 self .tuning_mode , self .single_algorithm_time_limit , self .create_ensemble )
142179 if self .experiment is None :
143180 raise UndefinedExperimentException ()
0 commit comments