Skip to content
This repository was archived by the owner on Jun 30, 2022. It is now read-only.

Commit e24ed0c

Browse files
author
Piotr Plonski
committed
add validations
1 parent f576f99 commit e24ed0c

6 files changed

Lines changed: 260 additions & 53 deletions

File tree

mljar/client/experiment.py

Lines changed: 38 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
import json
2+
import warnings
23
from base import MljarHttpClient
34
from ..model.experiment import Experiment
45
from ..exceptions import NotFoundException, MljarException, CreateExperimentException
@@ -8,7 +9,7 @@
89
from ..log import logger
910

1011
from ..utils import make_hash
11-
from ..utils import MLJAR_VALIDATIONS, MLJAR_METRICS, MLJAR_TUNING_MODES, MLJAR_DEFAULT_ALGORITHMS, MLJAR_DEFAULT_METRICS
12+
from ..utils import MLJAR_METRICS, MLJAR_TUNING_MODES, MLJAR_DEFAULT_ALGORITHMS, MLJAR_DEFAULT_METRICS
1213

1314
class ExperimentClient(MljarHttpClient):
1415
'''
@@ -45,13 +46,32 @@ def create_experiment(self, data):
4546
raise CreateExperimentException()
4647
return Experiment.from_dict(response.json())
4748

48-
def add_experiment_if_not_exists(self, train_dataset, experiment_title, project_task, \
49-
validation, algorithms, metric, \
49+
def add_experiment_if_not_exists(self, train_dataset, vald_dataset, experiment_title, project_task, \
50+
validation_kfolds, validation_shuffle, \
51+
validation_stratify, validation_train_split, \
52+
algorithms, metric, \
5053
tuning_mode, time_constraint, create_ensemble):
5154
logger.info('Add experiment if not exists')
5255
# parameters validation
53-
if validation is None or validation == '' or validation not in MLJAR_VALIDATIONS:
54-
validation = MLJAR_DEFAULT_VALIDATION
56+
# validation with dataset
57+
if vald_dataset is not None:
58+
validation = "With dataset"
59+
else:
60+
# do train/validation split
61+
if validation_train_split is not None:
62+
percents = int(validation_train_split * 100.0)
63+
validation = "Split {}/{}".format(percents, 100-percents)
64+
else:
65+
validation = "{}-fold CV".format(validation_kfolds)
66+
67+
# shuffle and stratify
68+
if validation_shuffle:
69+
validation += ", Shuffle"
70+
if validation_stratify and project_task == 'bin_class':
71+
validation += ", Stratify"
72+
if validation_stratify and project_task != 'bin_class':
73+
warnings.warn('Cannot use stratify in validation for your project task. Omitting this option in validation.')
74+
5575
if metric is None or metric == '' or metric not in MLJAR_METRICS:
5676
metric = MLJAR_DEFAULT_METRICS[project_task]
5777
if tuning_mode is None or tuning_mode == '' or tuning_mode not in MLJAR_TUNING_MODES:
@@ -67,18 +87,22 @@ def add_experiment_if_not_exists(self, train_dataset, experiment_title, project_
6787
dataset_preproc['convert_categorical'] = 'categorical_to_int'
6888
# create stub for new experiment
6989
logger.info('Create new experiment stub')
90+
expt_params = {
91+
"train_dataset": {"id": train_dataset.hid, 'title': train_dataset.title},
92+
"algs":algorithms,
93+
"preproc": dataset_preproc,
94+
"single_limit":time_constraint,
95+
"ensemble":create_ensemble,
96+
"random_start_cnt": MLJAR_TUNING_MODES[tuning_mode]['random_start_cnt'],
97+
"hill_climbing_cnt": MLJAR_TUNING_MODES[tuning_mode]['hill_climbing_cnt']
98+
}
99+
if vald_dataset is not None:
100+
expt_params['vald_dataset'] = {"id": vald_dataset.hid, 'title': vald_dataset.title}
101+
70102
new_expt = Experiment(hid='', title=experiment_title, models_cnt=0, task=project_task,
71103
description='', metric=metric, validation_scheme=validation,
72104
total_timelog=0, bestalg=[], details={},
73-
params={
74-
"train_dataset": {"id": train_dataset.hid, 'title': train_dataset.title},
75-
"algs":algorithms,
76-
"preproc": dataset_preproc,
77-
"single_limit":time_constraint,
78-
"ensemble":create_ensemble,
79-
"random_start_cnt": MLJAR_TUNING_MODES[tuning_mode]['random_start_cnt'],
80-
"hill_climbing_cnt": MLJAR_TUNING_MODES[tuning_mode]['hill_climbing_cnt']
81-
},
105+
params=expt_params,
82106
compute_now=0, computation_started_at=None, created_at=None,
83107
created_by=None, parent_project=self.project_hid)
84108

mljar/mljar.py

Lines changed: 51 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,10 @@ class Mljar(object):
2626
def __init__(self, project, experiment,
2727
metric = '',
2828
algorithms = [],
29-
validation = MLJAR_DEFAULT_VALIDATION,
29+
validation_kfolds = MLJAR_DEFAULT_FOLDS,
30+
validation_shuffle = MLJAR_DEFAULT_SHUFFLE,
31+
validation_stratify = MLJAR_DEFAULT_STRATIFY,
32+
validation_train_split = MLJAR_DEFAULT_TRAIN_SPLIT,
3033
tuning_mode = MLJAR_DEFAULT_TUNING_MODE,
3134
create_ensemble = MLJAR_DEFAULT_ENSEMBLE,
3235
single_algorithm_time_limit = MLJAR_DEFAULT_TIME_CONSTRAINT):
@@ -60,12 +63,20 @@ def __init__(self, project, experiment,
6063
- rmse which is Root Mean Square Error
6164
- mse which is for Mean Square Error
6265
- mae which is for Mean Absolute Error
63-
validation: The schema of validation that will be used for model search and tuning. There is only available
64-
validation with cross validation. Proper values are:
65-
- 3fold for 3-fold Stratified CV
66-
- 5fold for 5-fold Stratified CV
67-
- 10fold for 10-fold Stratified CV
68-
The default is 5-fold CV.
66+
validation_kfolds: The number of folds to be used in k-fold cross-validation.
67+
It is ignored if validation_train_split is not None
68+
or if a validation dataset is provided.
69+
It must be a number from 2 to 15 (inclusive).
70+
validation_shuffle: Boolean that specifies whether samples are shuffled before training.
71+
It is used in k-fold CV and in the train/validation split. Default is True.
72+
It is ignored when validating with a separate dataset.
73+
validation_stratify: The boolean which decides whether samples will be
74+
divided into folds with the same class distribution.
75+
In regression tasks this flag is ignored. Default is set to True.
76+
validation_train_split: The ratio used to split the training dataset into train and validation parts.
77+
It specifies the fraction of the input data that is used for training.
78+
It should be between 0.05 and 0.95 (inclusive). If it is not None, then
79+
validation_kfolds variable is ignored.
6980
single_algorithm_time_limit: The time in minutes that will be spent on training a single algorithm.
7081
Default value is 5 minutes.
7182
'''
@@ -89,19 +100,34 @@ def __init__(self, project, experiment,
89100
# below params are validated later
90101
self.algorithms = algorithms
91102
self.metric = metric
92-
self.validation = validation
93103
self.single_algorithm_time_limit = single_algorithm_time_limit
94104
self.wait_till_all_done = True
95105
self.selected_algorithm = None
96106
self.project = None
97107
self.experiment = None
98108

99-
def fit(self, X, y, wait_till_all_done = True):
109+
self.validation_kfolds = validation_kfolds
110+
self.validation_shuffle = validation_shuffle
111+
self.validation_stratify = validation_stratify
112+
self.validation_train_split = validation_train_split
113+
114+
if self.validation_kfolds is not None:
115+
if self.validation_kfolds < 2 or self.validation_kfolds > 15:
116+
raise MljarException('Wrong validation_kfolds parameter value, it should be in [2, 15] range.')
117+
118+
if self.validation_train_split is not None:
119+
if self.validation_train_split < 0.05 or self.validation_train_split > 0.95:
120+
raise MljarException('Wrong validation_train_split parameter value, it should be in (0.05, 0.95) range.')
121+
122+
123+
def fit(self, X, y, validation_data = None, wait_till_all_done = True):
100124
'''
101125
Fit models with MLJAR engine.
102126
Args:
103127
X: The numpy or pandas matrix with training data.
104128
y: The numpy or pandas vector with target values.
129+
validation_data: Tuple (X, y) with validation data. If set to None, then
130+
the k-fold CV or train split validation will be used.
105131
wait_till_all_done: The flag which decides if fit function will wait
106132
till experiment is done.
107133
'''
@@ -113,12 +139,12 @@ def fit(self, X, y, wait_till_all_done = True):
113139
raise IncorrectInputDataException('Sorry, there is a missmatch between X and y matrices shapes')
114140

115141
try:
116-
self._start_experiment(X, y)
142+
self._start_experiment(X, y, validation_data)
117143
except Exception as e:
118144
print 'Ups, %s' % str(e)
119145

120146

121-
def _start_experiment(self, X, y):
147+
def _start_experiment(self, X, y, validation_data = None):
122148

123149
# define project task
124150
self.project_task = 'bin_class' if len(np.unique(y)) == 2 else 'reg'
@@ -130,14 +156,25 @@ def _start_experiment(self, X, y):
130156
#
131157
# add a dataset to project
132158
#
133-
logger.info('MLJAR: add dataset')
159+
logger.info('MLJAR: add training dataset')
134160
self.dataset = DatasetClient(self.project.hid).add_dataset_if_not_exists(X, y)
161+
162+
self.dataset_vald = None
163+
if validation_data is not None:
164+
if len(validation_data) == 2:
165+
raise MljarException('Wrong format of validation data. It should be tuple (X,y)')
166+
logger.info('MLJAR: add validation dataset')
167+
X_vald, y_vald = validation_data
168+
self.dataset_vald = DatasetClient(self.project.hid).add_dataset_if_not_exists(X_vald, y_vald)
135169
#
136170
# add experiment to project
137171
#
138172
logger.info('MLJAR: add experiment')
139-
self.experiment = ExperimentClient(self.project.hid).add_experiment_if_not_exists(self.dataset, self.experiment_title, self.project_task, \
140-
self.validation, self.algorithms, self.metric, \
173+
self.experiment = ExperimentClient(self.project.hid).add_experiment_if_not_exists(self.dataset, self.dataset_vald, \
174+
self.experiment_title, self.project_task, \
175+
self.validation_kfolds, self.validation_shuffle, \
176+
self.validation_stratify, self.validation_train_split, \
177+
self.algorithms, self.metric, \
141178
self.tuning_mode, self.single_algorithm_time_limit, self.create_ensemble)
142179
if self.experiment is None:
143180
raise UndefinedExperimentException()

mljar/utils.py

Lines changed: 5 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -18,12 +18,11 @@
1818
'mae' : 'Mean Absolute Error'
1919
}
2020

21-
MLJAR_VALIDATIONS = {
22-
"3fold" : "3-fold Stratified CV",
23-
"5fold" : "5-fold Stratified CV",
24-
"10fold": "10-fold Stratified CV",
25-
"with_validation": "validation dataset"
26-
}
21+
MLJAR_DEFAULT_FOLDS = 5
22+
MLJAR_DEFAULT_SHUFFLE = True
23+
MLJAR_DEFAULT_STRATIFY = True
24+
MLJAR_DEFAULT_TRAIN_SPLIT = None
25+
2726

2827
MLJAR_BIN_CLASS = {
2928
"xgb" :"Extreme Gradient Boosting",
@@ -63,7 +62,6 @@
6362
'regression': ['xgbr', 'lgbr']
6463
}
6564

66-
MLJAR_DEFAULT_VALIDATION = '5fold'
6765
MLJAR_DEFAULT_ENSEMBLE = True
6866
MLJAR_DEFAULT_TUNING_MODE = 'Normal'
6967
MLJAR_DEFAULT_TIME_CONSTRAINT = '5' # minutes

tests/data/test_1_vald.csv

Lines changed: 91 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,91 @@
1+
sepal length,sepal width,petal length,petal width,class
2+
5.1,3.5,1.4,0.2,0
3+
4.9,3.0,1.4,0.2,0
4+
4.7,3.2,1.3,0.2,0
5+
4.6,3.1,1.5,0.2,0
6+
5.0,3.6,1.4,0.2,0
7+
5.4,3.9,1.7,0.4,0
8+
4.6,3.4,1.4,0.3,0
9+
5.0,3.4,1.5,0.2,0
10+
4.4,2.9,1.4,0.2,0
11+
4.9,3.1,1.5,0.1,0
12+
5.4,3.7,1.5,0.2,0
13+
4.8,3.4,1.6,0.2,0
14+
4.8,3.0,1.4,0.1,0
15+
4.3,3.0,1.1,0.1,0
16+
5.8,4.0,1.2,0.2,0
17+
5.7,4.4,1.5,0.4,0
18+
5.4,3.9,1.3,0.4,0
19+
5.1,3.5,1.4,0.3,0
20+
5.7,3.8,1.7,0.3,0
21+
5.1,3.8,1.5,0.3,0
22+
5.4,3.4,1.7,0.2,0
23+
5.1,3.7,1.5,0.4,0
24+
4.6,3.6,1.0,0.2,0
25+
5.1,3.3,1.7,0.5,0
26+
4.8,3.4,1.9,0.2,0
27+
5.0,3.0,1.6,0.2,0
28+
5.0,3.4,1.6,0.4,0
29+
5.2,3.5,1.5,0.2,0
30+
5.2,3.4,1.4,0.2,0
31+
4.7,3.2,1.6,0.2,0
32+
4.8,3.1,1.6,0.2,0
33+
5.4,3.4,1.5,0.4,0
34+
5.2,4.1,1.5,0.1,0
35+
5.5,4.2,1.4,0.2,0
36+
4.9,3.1,1.5,0.1,0
37+
5.0,3.2,1.2,0.2,0
38+
5.5,3.5,1.3,0.2,0
39+
4.9,3.1,1.5,0.1,0
40+
4.4,3.0,1.3,0.2,0
41+
5.1,3.4,1.5,0.2,0
42+
5.0,3.5,1.3,0.3,0
43+
4.5,2.3,1.3,0.3,0
44+
4.4,3.2,1.3,0.2,0
45+
5.0,3.5,1.6,0.6,0
46+
5.1,3.8,1.9,0.4,0
47+
5.7,2.8,4.5,1.3,1
48+
6.3,3.3,4.7,1.6,1
49+
4.9,2.4,3.3,1.0,1
50+
6.6,2.9,4.6,1.3,1
51+
5.2,2.7,3.9,1.4,1
52+
5.0,2.0,3.5,1.0,1
53+
5.9,3.0,4.2,1.5,1
54+
6.0,2.2,4.0,1.0,1
55+
6.1,2.9,4.7,1.4,1
56+
5.6,2.9,3.6,1.3,1
57+
6.7,3.1,4.4,1.4,1
58+
5.6,3.0,4.5,1.5,1
59+
5.8,2.7,4.1,1.0,1
60+
6.2,2.2,4.5,1.5,1
61+
5.6,2.5,3.9,1.1,1
62+
5.9,3.2,4.8,1.8,1
63+
6.1,2.8,4.0,1.3,1
64+
6.3,2.5,4.9,1.5,1
65+
6.1,2.8,4.7,1.2,1
66+
6.4,2.9,4.3,1.3,1
67+
6.6,3.0,4.4,1.4,1
68+
6.8,2.8,4.8,1.4,1
69+
6.7,3.0,5.0,1.7,1
70+
6.0,2.9,4.5,1.5,1
71+
5.7,2.6,3.5,1.0,1
72+
5.5,2.4,3.8,1.1,1
73+
5.5,2.4,3.7,1.0,1
74+
5.8,2.7,3.9,1.2,1
75+
6.0,2.7,5.1,1.6,1
76+
5.4,3.0,4.5,1.5,1
77+
6.0,3.4,4.5,1.6,1
78+
6.7,3.1,4.7,1.5,1
79+
6.3,2.3,4.4,1.3,1
80+
5.6,3.0,4.1,1.3,1
81+
5.5,2.5,4.0,1.3,1
82+
5.5,2.6,4.4,1.2,1
83+
6.1,3.0,4.6,1.4,1
84+
5.8,2.6,4.0,1.2,1
85+
5.0,2.3,3.3,1.0,1
86+
5.6,2.7,4.2,1.3,1
87+
5.7,3.0,4.2,1.2,1
88+
5.7,2.9,4.2,1.3,1
89+
6.2,2.9,4.3,1.3,1
90+
5.1,2.5,3.0,1.1,1
91+
5.7,2.8,4.1,1.3,1

0 commit comments

Comments
 (0)