Skip to content

Commit 18b8f97

Browse files
committed
added functionality that checks whether a run already exists on the server (and potentially does not execute it).
For this, the notion of setups is introduced (new module) and the order of run execution is changed back to a previous state (first the flow is constructed and downloaded, then the run is executed). A boolean config item "avoid_duplicate_runs" is added.
1 parent bc8895b commit 18b8f97

5 files changed

Lines changed: 136 additions & 61 deletions

File tree

openml/config.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,7 @@ def _setup():
3636
"""
3737
global apikey
3838
global server
39+
global avoid_duplicate_runs
3940
# read config file, create cache directory
4041
try:
4142
os.mkdir(os.path.expanduser('~/.openml'))
@@ -46,6 +47,7 @@ def _setup():
4647
apikey = config.get('FAKE_SECTION', 'apikey')
4748
server = config.get('FAKE_SECTION', 'server')
4849
cache_dir = config.get('FAKE_SECTION', 'cachedir')
50+
avoid_duplicate_runs = config.getboolean('FAKE_SECTION', 'avoid_duplicate_runs')
4951
set_cache_directory(cache_dir)
5052

5153

@@ -84,7 +86,8 @@ def _parse_config():
8486
defaults = {'apikey': apikey,
8587
'server': server,
8688
'verbosity': 0,
87-
'cachedir': os.path.expanduser('~/.openml/cache')}
89+
'cachedir': os.path.expanduser('~/.openml/cache'),
90+
'avoid_duplicate_runs': 'True'}
8891

8992
config_file = os.path.expanduser('~/.openml/config')
9093
config = configparser.RawConfigParser(defaults=defaults)

openml/runs/functions.py

Lines changed: 40 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,8 @@
88

99
from build.lib.openml.exceptions import PyOpenMLError
1010
from .. import config
11-
from ..flows import sklearn_to_flow
11+
from ..flows import sklearn_to_flow, get_flow
12+
from ..setups import setup_exists
1213
from ..exceptions import OpenMLCacheException
1314
from ..util import URLError
1415
from ..tasks.functions import _create_task_from_xml
@@ -42,6 +43,20 @@ def run_task(task, model):
4243
# TODO move this into its own module. While it somehow belongs here, it
4344
# adds quite a lot of functionality which is better suited in other places!
4445
# TODO why doesn't this accept a flow as input? - this would make this more flexible!
46+
flow = sklearn_to_flow(model)
47+
flow_id = flow._ensure_flow_exists()
48+
if flow_id < 0:
49+
print("No flow")
50+
return 0, 2
51+
config.logger.info(flow_id)
52+
53+
if config.avoid_duplicate_runs:
54+
# TODO: would be nice if flow._ensure_flow_exists already handled this
55+
flow = get_flow(flow_id)
56+
setup_id = setup_exists(flow, model)
57+
ids = _run_exists(task.task_id, setup_id)
58+
if ids:
59+
raise PyOpenMLError("Run already exists in server. Run id(s): %s" %str(ids))
4560

4661
dataset = task.get_dataset()
4762
X, Y = dataset.get_data(target=task.target_name)
@@ -52,27 +67,34 @@ def run_task(task, model):
5267
'only works for tasks with class labels.')
5368

5469
# execute the run
55-
run = OpenMLRun(task_id=task.task_id, flow_id=None, dataset_id=dataset.dataset_id, model=model)
70+
run = OpenMLRun(task_id=task.task_id, flow_id=flow_id, dataset_id=dataset.dataset_id, model=model)
5671

5772
try:
5873
run.data_content, run.trace_content = _run_task_get_arffcontent(model, task, class_labels)
5974
except PyOpenMLError as message:
6075
run.error_message = str(message)
6176
warnings.warn("Run terminated with error: %s" %run.error_message)
6277

63-
# now generate the flow
64-
flow = sklearn_to_flow(model)
65-
flow_id = flow._ensure_flow_exists()
66-
if flow_id < 0:
67-
print("No flow")
68-
return 0, 2
69-
config.logger.info(flow_id)
70-
71-
# attach the flow to the run
72-
run.flow_id = flow_id
73-
7478
return run
7579

80+
def _run_exists(task_id, setup_id):
81+
'''
82+
Checks whether a task/setup combination is already present on the server.
83+
84+
:param task_id: int
85+
:param setup_id: int
86+
:return: List of run ids iff these already exists on the server, False otherwise
87+
'''
88+
if setup_id <= 0:
89+
# openml setups are in range 1-inf
90+
return False
91+
92+
result = list_runs(task=[task_id], setup=[setup_id])
93+
if len(result) > 0:
94+
return set(result.keys())
95+
else:
96+
return False
97+
7698

7799
def _prediction_to_row(rep_no, fold_no, row_id, correct_label, predicted_label, predicted_probabilities, class_labels, model_classes_mapping):
78100
"""Complicated util function that turns probability estimates of a classifier for a given instance into the right arff format to upload to openml.
@@ -329,7 +351,7 @@ def _get_cached_run(run_id):
329351
"cached" % run_id)
330352

331353

332-
def list_runs(offset=None, size=None, id=None, task=None,
354+
def list_runs(offset=None, size=None, id=None, task=None, setup=None,
333355
flow=None, uploader=None, tag=None):
334356
"""List all runs matching all of the given filters.
335357
@@ -346,6 +368,8 @@ def list_runs(offset=None, size=None, id=None, task=None,
346368
347369
task : list, optional
348370
371+
setup: list, optional
372+
349373
flow : list, optional
350374
351375
uploader : list, optional
@@ -367,6 +391,8 @@ def list_runs(offset=None, size=None, id=None, task=None,
367391
api_call += "/run/%s" % ','.join([str(int(i)) for i in id])
368392
if task is not None:
369393
api_call += "/task/%s" % ','.join([str(int(i)) for i in task])
394+
if setup is not None:
395+
api_call += "/setup/%s" % ','.join([str(int(i)) for i in setup])
370396
if flow is not None:
371397
api_call += "/flow/%s" % ','.join([str(int(i)) for i in flow])
372398
if uploader is not None:

openml/runs/run.py

Lines changed: 50 additions & 46 deletions
Original file line numberDiff line numberDiff line change
@@ -166,7 +166,7 @@ def _create_description_xml(self):
166166
# TODO: don't we have flow object in data structure? Use this one
167167
downloaded_flow = openml.flows.get_flow(self.flow_id)
168168

169-
openml_param_settings = _parse_parameters(self.model, downloaded_flow)
169+
openml_param_settings = OpenMLRun._parse_parameters(self.model, downloaded_flow)
170170

171171
# as a tag, it must be of the form ([a-zA-Z0-9_\-\.])+
172172
# so we format time from 'mm/dd/yy hh:mm:ss' to 'mm-dd-yy_hh.mm.ss'
@@ -182,55 +182,59 @@ def _create_description_xml(self):
182182
description_xml = xmltodict.unparse(description, pretty=True)
183183
return description_xml
184184

185-
def _parse_parameters(model, flow):
186-
"""Extracts all parameter settings from a model in OpenML format.
185+
@staticmethod
def _parse_parameters(model, flow):
    """Extract all parameter settings from a model in OpenML format.

    Parameters
    ----------
    model
        the scikit-learn model (fitted)
    flow
        openml flow object (containing flow ids, i.e., it has to be
        downloaded from the server)
    """
    if flow.flow_id is None:
        raise ValueError("The flow parameter needs to be downloaded from server")

    model_params = model.get_params()
    settings = []

    # Map every (sub)flow name to its server-side flow id, recursively.
    def collect_flow_ids(current):
        id_by_name = {current.name: current.flow_id}
        for key in current.components:
            id_by_name.update(collect_flow_ids(current.components[key]))
        return id_by_name

    id_by_name = collect_flow_ids(flow)

    for name in model_params:
        value = model_params[name]
        if "__" in name:
            # Parameter of a subflow; handled by the recursive call below.
            continue
        if isinstance(value, BaseEstimator):
            # Extract the sub-estimator's parameters individually.
            settings += OpenMLRun._parse_parameters(value, flow.components[name])

        # Add the parameter setting (in some cases also the subflow,
        # just because we can).
        if name in flow.parameters.keys():
            settings.append(OrderedDict([
                ('oml:name', name),
                ('oml:value', str(value)),
                ('oml:component', id_by_name[flow.name]),
            ]))
        elif flow.name.startswith(("sklearn.pipeline.Pipeline",
                                   "sklearn.pipeline.FeatureUnion")):
            # Pipelines/FeatureUnions list their steps as params; tolerate.
            pass
        else:
            raise ValueError("parameter %s not in flow description of flow %s" % (name, flow.name))

    return settings
234238

235239
################################################################################
236240
# Functions which cannot be in runs/functions due to circular imports

openml/setups/__init__.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
from .functions import setup_exists
2+
3+
__all__ = ['setup_exists']

openml/setups/functions.py

Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,39 @@
1+
import openml
2+
import xmltodict
3+
4+
from collections import OrderedDict
5+
6+
def setup_exists(downloaded_flow, sklearn_model):
    """Check whether a flow / hyperparameter configuration already exists
    on the server.

    Parameters
    ----------
    downloaded_flow
        The openml flow object. It has to be downloaded from the server
        so that its flow ids are populated.
    sklearn_model
        The scikit-learn model whose parameter settings are serialized
        and compared against the server.

    Returns
    -------
    int or bool
        The setup id if the setup exists on the server, False otherwise.
    """
    # Sadly, this API call relies on a run object to serialize the parameters.
    openml_param_settings = openml.runs.OpenMLRun._parse_parameters(
        sklearn_model, downloaded_flow)
    description = xmltodict.unparse(
        _to_dict(downloaded_flow.flow_id, openml_param_settings), pretty=True)
    file_elements = {'description': ('description.arff', description)}

    result = openml._api_calls._perform_api_call('/setup/exists/',
                                                 file_elements=file_elements)
    result_dict = xmltodict.parse(result)
    if 'oml:id' in result_dict['oml:setup_exists']:
        return int(result_dict['oml:setup_exists']['oml:id'])
    else:
        return False
30+
31+
32+
def _to_dict(flow_id, openml_parameter_settings):
33+
xml = OrderedDict()
34+
xml['oml:run'] = OrderedDict()
35+
xml['oml:run']['@xmlns:oml'] = 'http://openml.org/openml'
36+
xml['oml:run']['oml:flow_id'] = flow_id
37+
xml['oml:run']['oml:parameter_setting'] = openml_parameter_settings
38+
39+
return xml

0 commit comments

Comments
 (0)