Skip to content
This repository was archived by the owner on Jun 30, 2022. It is now read-only.

Commit 78bf675

Browse files
author
Piotr Plonski
committed
validation update
1 parent e24ed0c commit 78bf675

7 files changed

Lines changed: 102 additions & 25 deletions

File tree

mljar/client/dataset.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -102,7 +102,7 @@ def _wait_till_all_datasets_are_valid(self):
102102

103103

104104

105-
def add_dataset_if_not_exists(self, X, y):
105+
def add_dataset_if_not_exists(self, X, y, title_prefix = 'dataset-'):
106106
'''
107107
Checks if the dataset already exists; if not, it adds the dataset to the project.
108108
'''
@@ -119,7 +119,7 @@ def add_dataset_if_not_exists(self, X, y):
119119
# dataset with specified hash does not exist
120120
if len(dataset_details) != 1:
121121
# add new dataset
122-
dataset_details = self.add_new_dataset(data, y)
122+
dataset_details = self.add_new_dataset(data, y, title_prefix)
123123
else:
124124
dataset_details = dataset_details[0]
125125

@@ -149,9 +149,9 @@ def _accept_dataset_column_usage(self, dataset_hid):
149149
return response.status_code == 200
150150

151151

152-
def add_new_dataset(self, data, y):
152+
def add_new_dataset(self, data, y, title_prefix = 'dataset-'):
153153
logger.info('Add new dataset')
154-
title = 'dataset-' + str(uuid.uuid4())[:4] # set some random name
154+
title = title_prefix + str(uuid.uuid4())[:4] # set some random name
155155
file_path = '/tmp/dataset-'+ str(uuid.uuid4())[:8]+'.csv'
156156

157157
logger.info('Compress data before export')

mljar/mljar.py

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,8 @@
66
import numpy as np
77

88
from utils import *
9-
from exceptions import BadValueException, IncorrectInputDataException, UndefinedExperimentException
9+
from exceptions import IncorrectInputDataException, UndefinedExperimentException
10+
from exceptions import MljarException, BadValueException
1011

1112
from client.project import ProjectClient
1213
from client.dataset import DatasetClient
@@ -157,15 +158,15 @@ def _start_experiment(self, X, y, validation_data = None):
157158
# add a dataset to project
158159
#
159160
logger.info('MLJAR: add training dataset')
160-
self.dataset = DatasetClient(self.project.hid).add_dataset_if_not_exists(X, y)
161+
self.dataset = DatasetClient(self.project.hid).add_dataset_if_not_exists(X, y, title_prefix = 'Training-')
161162

162163
self.dataset_vald = None
163164
if validation_data is not None:
164-
if len(validation_data) == 2:
165+
if len(validation_data) != 2:
165166
raise MljarException('Wrong format of validation data. It should be tuple (X,y)')
166167
logger.info('MLJAR: add validation dataset')
167168
X_vald, y_vald = validation_data
168-
self.dataset_vald = DatasetClient(self.project.hid).add_dataset_if_not_exists(X_vald, y_vald)
169+
self.dataset_vald = DatasetClient(self.project.hid).add_dataset_if_not_exists(X_vald, y_vald, title_prefix = 'Validation-')
169170
#
170171
# add experiment to project
171172
#

mljar/model/project.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -29,9 +29,9 @@ def make_project_instance(self, data):
2929
class Project(BaseModel):
3030
schema = ProjectSchema(strict=True)
3131

32-
def __init__(self, hid, title, description, task, hardware, scope, info, created_at, created_by,
33-
experiments_cnt, models_cnt, datasets, topalg,
34-
compute_now, insights, total_timelog = 0):
32+
def __init__(self, hid, title, description, task, hardware, scope, created_at, created_by,
33+
models_cnt, compute_now, experiments_cnt = None, datasets = None, topalg = None,
34+
insights = None, total_timelog = 0, info = None):
3535
self.hid = hid
3636
self.title = title
3737
self.description = description

setup.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@
1010

1111
setup(
1212
name='mljar',
13-
version='0.0.5',
13+
version='0.0.6',
1414
description='Python wrapper over MLJAR API',
1515
long_description=long_description,
1616
url='https://github.com/mljar/mljar-api-python',

tests/mljar_test.py

Lines changed: 73 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -42,7 +42,7 @@ def test_basic_usage(self):
4242
'''
4343
model = Mljar(project = self.proj_title, experiment = self.expt_title,
4444
algorithms = ['xgb'], metric='logloss',
45-
validation='3fold', tuning_mode='Normal')
45+
validation_kfolds=3, tuning_mode='Normal')
4646
self.assertTrue(model is not None)
4747
# fit models and wait till all models are trained
4848
model.fit(X = self.X, y = self.y)
@@ -52,6 +52,72 @@ def test_basic_usage(self):
5252
score = self.mse(pred, self.y)
5353
self.assertTrue(score < 0.1)
5454

55+
def test_usage_with_defaults(self):
56+
'''
57+
Test usage with defaults.
58+
'''
59+
model = Mljar(project = self.proj_title, experiment = self.expt_title)
60+
self.assertTrue(model is not None)
61+
# fit models and wait till all models are trained
62+
model.fit(X = self.X, y = self.y, wait_till_all_done = False)
63+
# wait some time
64+
time.sleep(120) # wait a little longer - there are a lot of models
65+
# run prediction
66+
pred = model.predict(self.X)
67+
# get MSE
68+
score = self.mse(pred, self.y)
69+
self.assertTrue(score < 0.5)
70+
# check default validation
71+
self.assertEqual(model.selected_algorithm.validation_scheme, "5-fold CV, Shuffle, Stratify")
72+
73+
def test_usage_with_train_split(self):
74+
'''
75+
Test usage with train split.
76+
'''
77+
model = Mljar(project = self.proj_title, experiment = self.expt_title,
78+
validation_train_split = 0.8, algorithms = ['xgb'], tuning_mode='Normal')
79+
self.assertTrue(model is not None)
80+
# fit models and wait till all models are trained
81+
model.fit(X = self.X, y = self.y, wait_till_all_done = False)
82+
# wait some time
83+
time.sleep(60)
84+
# run prediction
85+
pred = model.predict(self.X)
86+
# get MSE
87+
score = self.mse(pred, self.y)
88+
self.assertTrue(score < 0.5)
89+
# check default validation
90+
self.assertEqual(model.selected_algorithm.validation_scheme, "Split 80/20, Shuffle, Stratify")
91+
92+
93+
def test_usage_with_validation_dataset(self):
94+
'''
95+
Test usage with validation dataset.
96+
'''
97+
model = Mljar(project = self.proj_title, experiment = self.expt_title,
98+
algorithms = ['xgb'], tuning_mode='Normal')
99+
self.assertTrue(model is not None)
100+
# load validation data
101+
df = pd.read_csv('tests/data/test_1_vald.csv')
102+
cols = ['sepal length', 'sepal width', 'petal length', 'petal width']
103+
target = 'class'
104+
X_vald = df[cols]
105+
y_vald = df[target]
106+
# fit models and wait till all models are trained
107+
model.fit(X = self.X, y = self.y, validation_data=(X_vald, y_vald), wait_till_all_done = False)
108+
# wait some time
109+
time.sleep(80)
110+
# run prediction
111+
pred = model.predict(self.X)
112+
# get MSE
113+
score = self.mse(pred, self.y)
114+
self.assertTrue(score < 0.5)
115+
# check default validation
116+
self.assertEqual(model.selected_algorithm.validation_scheme, "With dataset")
117+
118+
119+
120+
55121
def test_empty_project_title(self):
56122
with self.assertRaises(BadValueException) as context:
57123
model = Mljar(project = '', experiment = '')
@@ -86,7 +152,7 @@ def test_non_wait_fit(self):
86152
'''
87153
model = Mljar(project = self.proj_title, experiment = self.expt_title,
88154
algorithms = ['xgb'], metric='logloss',
89-
validation='3fold', tuning_mode='Normal')
155+
validation_kfolds=3, tuning_mode='Normal')
90156
self.assertTrue(model is not None)
91157
# fit models, just start computation and do not wait
92158
start_time = time.time()
@@ -124,7 +190,7 @@ def test_retrive_models(self):
124190
'''
125191
model = Mljar(project = self.proj_title, experiment = self.expt_title,
126192
algorithms = ['xgb'], metric='logloss',
127-
validation='3fold', tuning_mode='Normal')
193+
validation_kfolds=3, tuning_mode='Normal')
128194
self.assertTrue(model is not None)
129195
# fit models and wait till all models are trained
130196
model.fit(X = self.X, y = self.y)
@@ -153,7 +219,7 @@ def test_retrive_models(self):
153219
start_time = time.time()
154220
model_2 = Mljar(project = self.proj_title, experiment = self.expt_title,
155221
algorithms = ['xgb'], metric='logloss',
156-
validation='3fold', tuning_mode='Normal')
222+
validation_kfolds=3, tuning_mode='Normal')
157223
self.assertTrue(model_2 is not None)
158224
# re-use trained models
159225
model_2.fit(X = self.X, y = self.y)
@@ -184,3 +250,6 @@ def test_basic_usage_with_defaults(self):
184250
score = self.mse(pred, self.y)
185251
self.assertTrue(score < 0.1)
186252
'''
253+
254+
if __name__ == "__main__":
255+
unittest.main()

tests/result_client_test.py

Lines changed: 12 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,10 @@ def setUp(self):
2020
proj_title = 'Test project-01'
2121
proj_task = 'bin_class'
2222
self.expt_title = 'Test experiment-01'
23-
self.validation = '5fold'
23+
self.validation_kfolds = 5
24+
self.validation_shuffle = True
25+
self.validation_stratify = True
26+
self.validation_train_split = None
2427
self.algorithms = ['xgb']
2528
self.metric = 'logloss'
2629
self.tuning_mode = 'Normal'
@@ -60,8 +63,10 @@ def test_get_results_for_project(self):
6063
# add experiment
6164
ec = ExperimentClient(self.project.hid)
6265
# create new experiment
63-
self.experiment = ec.add_experiment_if_not_exists(self.dataset, self.expt_title, self.project.task,
64-
self.validation, self.algorithms, self.metric,
66+
self.experiment = ec.add_experiment_if_not_exists(self.dataset, None, self.expt_title, self.project.task,
67+
self.validation_kfolds, self.validation_shuffle,
68+
self.validation_stratify, self.validation_train_split,
69+
self.algorithms, self.metric,
6570
self.tuning_mode, self.time_constraint, self.create_enseble)
6671
# wait some time till models are initialized
6772
time.sleep(60)
@@ -83,8 +88,10 @@ def test_get_results_for_experiment(self):
8388
# add experiment
8489
ec = ExperimentClient(self.project.hid)
8590
# create new experiment
86-
self.experiment = ec.add_experiment_if_not_exists(self.dataset, self.expt_title, self.project.task,
87-
self.validation, self.algorithms, self.metric,
91+
self.experiment = ec.add_experiment_if_not_exists(self.dataset, None, self.expt_title, self.project.task,
92+
self.validation_kfolds, self.validation_shuffle,
93+
self.validation_stratify, self.validation_train_split,
94+
self.algorithms, self.metric,
8895
self.tuning_mode, self.time_constraint, self.create_enseble)
8996
# wait some time till models are initialized
9097
time.sleep(60)

tests/run.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -4,11 +4,11 @@
44
import os
55
import unittest
66

7-
#from project_client_test import ProjectClientTest
8-
#from dataset_client_test import DatasetClientTest
7+
from project_client_test import ProjectClientTest
8+
from dataset_client_test import DatasetClientTest
99
from experiment_client_test import ExperimentClientTest
10-
#from result_client_test import ResultClientTest
11-
#from mljar_test import MljarTest
10+
from result_client_test import ResultClientTest
11+
from mljar_test import MljarTest
1212

1313
if __name__ == '__main__':
1414
unittest.main()

0 commit comments

Comments
 (0)