Skip to content

Commit 18b8f97

Browse files
committed
added functionality that checks whether a run already exists on the server (and potentially does not execute it).
For this, the notion of setups is introduced (new module) and the order of run execution is changed back to a previous state (first the flow is constructed and downloaded, then the run is executed). A boolean config item "avoid_duplicate_runs" is added.
1 parent bc8895b commit 18b8f97

5 files changed

Lines changed: 136 additions & 61 deletions

File tree

openml/config.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,7 @@ def _setup():
3636
"""
3737
global apikey
3838
global server
39+
global avoid_duplicate_runs
3940
# read config file, create cache directory
4041
try:
4142
os.mkdir(os.path.expanduser('~/.openml'))
@@ -46,6 +47,7 @@ def _setup():
4647
apikey = config.get('FAKE_SECTION', 'apikey')
4748
server = config.get('FAKE_SECTION', 'server')
4849
cache_dir = config.get('FAKE_SECTION', 'cachedir')
50+
avoid_duplicate_runs = config.getboolean('FAKE_SECTION', 'avoid_duplicate_runs')
4951
set_cache_directory(cache_dir)
5052

5153

@@ -84,7 +86,8 @@ def _parse_config():
8486
defaults = {'apikey': apikey,
8587
'server': server,
8688
'verbosity': 0,
87-
'cachedir': os.path.expanduser('~/.openml/cache')}
89+
'cachedir': os.path.expanduser('~/.openml/cache'),
90+
'avoid_duplicate_runs': 'True'}
8891

8992
config_file = os.path.expanduser('~/.openml/config')
9093
config = configparser.RawConfigParser(defaults=defaults)

openml/runs/functions.py

Lines changed: 40 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,8 @@
88

99
from build.lib.openml.exceptions import PyOpenMLError
1010
from .. import config
11-
from ..flows import sklearn_to_flow
11+
from ..flows import sklearn_to_flow, get_flow
12+
from ..setups import setup_exists
1213
from ..exceptions import OpenMLCacheException
1314
from ..util import URLError
1415
from ..tasks.functions import _create_task_from_xml
@@ -42,6 +43,20 @@ def run_task(task, model):
4243
# TODO move this into its own module. While it somehow belongs here, it
4344
# adds quite a lot of functionality which is better suited in other places!
4445
# TODO why doesn't this accept a flow as input? - this would make this more flexible!
46+
flow = sklearn_to_flow(model)
47+
flow_id = flow._ensure_flow_exists()
48+
if flow_id < 0:
49+
print("No flow")
50+
return 0, 2
51+
config.logger.info(flow_id)
52+
53+
if config.avoid_duplicate_runs:
54+
# TODO: would be nice if flow._ensure_flow_exists already handled this
55+
flow = get_flow(flow_id)
56+
setup_id = setup_exists(flow, model)
57+
ids = _run_exists(task.task_id, setup_id)
58+
if ids:
59+
raise PyOpenMLError("Run already exists in server. Run id(s): %s" %str(ids))
4560

4661
dataset = task.get_dataset()
4762
X, Y = dataset.get_data(target=task.target_name)
@@ -52,27 +67,34 @@ def run_task(task, model):
5267
'only works for tasks with class labels.')
5368

5469
# execute the run
55-
run = OpenMLRun(task_id=task.task_id, flow_id=None, dataset_id=dataset.dataset_id, model=model)
70+
run = OpenMLRun(task_id=task.task_id, flow_id=flow_id, dataset_id=dataset.dataset_id, model=model)
5671

5772
try:
5873
run.data_content, run.trace_content = _run_task_get_arffcontent(model, task, class_labels)
5974
except PyOpenMLError as message:
6075
run.error_message = str(message)
6176
warnings.warn("Run terminated with error: %s" %run.error_message)
6277

63-
# now generate the flow
64-
flow = sklearn_to_flow(model)
65-
flow_id = flow._ensure_flow_exists()
66-
if flow_id < 0:
67-
print("No flow")
68-
return 0, 2
69-
config.logger.info(flow_id)
70-
71-
# attach the flow to the run
72-
run.flow_id = flow_id
73-
7478
return run
7579

80+
def _run_exists(task_id, setup_id):
81+
'''
82+
Checks whether a task/setup combination is already present on the server.
83+
84+
:param task_id: int
85+
:param setup_id: int
86+
:return: List of run ids iff these already exists on the server, False otherwise
87+
'''
88+
if setup_id <= 0:
89+
# openml setups are in range 1-inf
90+
return False
91+
92+
result = list_runs(task=[task_id], setup=[setup_id])
93+
if len(result) > 0:
94+
return set(result.keys())
95+
else:
96+
return False
97+
7698

7799
def _prediction_to_row(rep_no, fold_no, row_id, correct_label, predicted_label, predicted_probabilities, class_labels, model_classes_mapping):
78100
"""Complicated util function that turns probability estimates of a classifier for a given instance into the right arff format to upload to openml.
@@ -329,7 +351,7 @@ def _get_cached_run(run_id):
329351
"cached" % run_id)
330352

331353

332-
def list_runs(offset=None, size=None, id=None, task=None,
354+
def list_runs(offset=None, size=None, id=None, task=None, setup=None,
333355
flow=None, uploader=None, tag=None):
334356
"""List all runs matching all of the given filters.
335357
@@ -346,6 +368,8 @@ def list_runs(offset=None, size=None, id=None, task=None,
346368
347369
task : list, optional
348370
371+
setup: list, optional
372+
349373
flow : list, optional
350374
351375
uploader : list, optional
@@ -367,6 +391,8 @@ def list_runs(offset=None, size=None, id=None, task=None,
367391
api_call += "/run/%s" % ','.join([str(int(i)) for i in id])
368392
if task is not None:
369393
api_call += "/task/%s" % ','.join([str(int(i)) for i in task])
394+
if setup is not None:
395+
api_call += "/setup/%s" % ','.join([str(int(i)) for i in setup])
370396
if flow is not None:
371397
api_call += "/flow/%s" % ','.join([str(int(i)) for i in flow])
372398
if uploader is not None:

openml/runs/run.py

Lines changed: 50 additions & 46 deletions
Original file line numberDiff line numberDiff line change
@@ -166,7 +166,7 @@ def _create_description_xml(self):
166166
# TODO: don't we have flow object in data structure? Use this one
167167
downloaded_flow = openml.flows.get_flow(self.flow_id)
168168

169-
openml_param_settings = _parse_parameters(self.model, downloaded_flow)
169+
openml_param_settings = OpenMLRun._parse_parameters(self.model, downloaded_flow)
170170

171171
# as a tag, it must be of the form ([a-zA-Z0-9_\-\.])+
172172
# so we format time from 'mm/dd/yy hh:mm:ss' to 'mm-dd-yy_hh.mm.ss'
@@ -182,55 +182,59 @@ def _create_description_xml(self):
182182
description_xml = xmltodict.unparse(description, pretty=True)
183183
return description_xml
184184

185-
def _parse_parameters(model, flow):
186-
"""Extracts all parameter settings from a model in OpenML format.
185+
@staticmethod
def _parse_parameters(model, flow):
    """Extract all parameter settings from a model in OpenML format.

    Parameters
    ----------
    model
        the scikit-learn model (fitted)
    flow
        openml flow object (containing flow ids, i.e., it has to be
        downloaded from the server)
    """
    if flow.flow_id is None:
        raise ValueError("The flow parameter needs to be downloaded from server")

    model_params = model.get_params()
    settings = []

    # Map every (sub)flow name to its server-side flow id, recursively.
    def collect_flow_ids(current):
        id_by_name = {current.name: current.flow_id}
        for key in current.components:
            id_by_name.update(collect_flow_ids(current.components[key]))
        return id_by_name

    id_by_name = collect_flow_ids(flow)

    for name in model_params:
        value = model_params[name]
        if "__" in name:
            # Parameter of a subflow; handled by the recursive call below.
            continue
        if isinstance(value, BaseEstimator):
            # Extract the sub-estimator's parameters individually.
            settings += OpenMLRun._parse_parameters(value, flow.components[name])

        # Add the parameter setting (in some cases also the subflow,
        # just because we can).
        if name in flow.parameters.keys():
            settings.append(OrderedDict([
                ('oml:name', name),
                ('oml:value', str(value)),
                ('oml:component', id_by_name[flow.name]),
            ]))
        elif flow.name.startswith(("sklearn.pipeline.Pipeline",
                                   "sklearn.pipeline.FeatureUnion")):
            # Pipelines/FeatureUnions list their steps as params; tolerate.
            pass
        else:
            raise ValueError("parameter %s not in flow description of flow %s" % (name, flow.name))

    return settings
234238

235239
################################################################################
236240
# Functions which cannot be in runs/functions due to circular imports

openml/setups/__init__.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
from .functions import setup_exists
2+
3+
__all__ = ['setup_exists']

openml/setups/functions.py

Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,39 @@
1+
import openml
2+
import xmltodict
3+
4+
from collections import OrderedDict
5+
6+
def setup_exists(downloaded_flow, sklearn_model):
    """Check whether a flow / hyperparameter configuration already exists
    on the server.

    Parameters
    ----------
    downloaded_flow
        The openml flow object. It has to be downloaded from the server
        so that its flow ids are populated.
    sklearn_model
        The scikit-learn model whose parameter settings are serialized
        and compared against the server.

    Returns
    -------
    int or bool
        The setup id if the setup exists on the server, False otherwise.
    """
    # Sadly, this API call relies on a run object to serialize the parameters.
    openml_param_settings = openml.runs.OpenMLRun._parse_parameters(
        sklearn_model, downloaded_flow)
    description = xmltodict.unparse(
        _to_dict(downloaded_flow.flow_id, openml_param_settings), pretty=True)
    file_elements = {'description': ('description.arff', description)}

    result = openml._api_calls._perform_api_call('/setup/exists/',
                                                 file_elements=file_elements)
    result_dict = xmltodict.parse(result)
    if 'oml:id' in result_dict['oml:setup_exists']:
        return int(result_dict['oml:setup_exists']['oml:id'])
    else:
        return False
30+
31+
32+
def _to_dict(flow_id, openml_parameter_settings):
33+
xml = OrderedDict()
34+
xml['oml:run'] = OrderedDict()
35+
xml['oml:run']['@xmlns:oml'] = 'http://openml.org/openml'
36+
xml['oml:run']['oml:flow_id'] = flow_id
37+
xml['oml:run']['oml:parameter_setting'] = openml_parameter_settings
38+
39+
return xml

0 commit comments

Comments
 (0)