Finish pep8 compliance (#630)

mfeurer · web-flow · commit 09806737a79e · 2019-02-25T10:30:02.000+01:00
* PEP8 remove pep8 violations

* Typo.
diff --git a/ci_scripts/flake8_diff.sh b/ci_scripts/flake8_diff.sh
@@ -1,156 +1,3 @@
 #!/bin/bash
 
-# Inspired from https://github.com/scikit-learn/scikit-learn/blob/master/build_tools/travis/flake8_diff.sh
-
-# This script is used in Travis to check that PRs do not add obvious
-# flake8 violations. It relies on two things:
-#   - find common ancestor between branch and
-#     openml/openml-python remote
-#   - run flake8 --diff on the diff between the branch and the common
-#     ancestor
-#
-# Additional features:
-#   - the line numbers in Travis match the local branch on the PR
-#     author machine.
-#   - ./ci_scripts/flake8_diff.sh can be run locally for quick
-#     turn-around
-
-set -e
-# pipefail is necessary to propagate exit codes
-set -o pipefail
-
-PROJECT=openml/openml-python
-PROJECT_URL=https://github.com/$PROJECT.git
-
-# Find the remote with the project name (upstream in most cases)
-REMOTE=$(git remote -v | grep $PROJECT | cut -f1 | head -1 || echo '')
-
-# Add a temporary remote if needed. For example this is necessary when
-# Travis is configured to run in a fork. In this case 'origin' is the
-# fork and not the reference repo we want to diff against.
-if [[ -z "$REMOTE" ]]; then
-    TMP_REMOTE=tmp_reference_upstream
-    REMOTE=$TMP_REMOTE
-    git remote add $REMOTE $PROJECT_URL
-fi
-
-echo "Remotes:"
-echo '--------------------------------------------------------------------------------'
-git remote --verbose
-
-echo "Travis variables:"
-echo '--------------------------------------------------------------------------------'
-echo "On travis: $TRAVIS"
-echo "Current branch: $TRAVIS_BRANCH"
-echo "Is a pull request test: $TRAVIS_PULL_REQUEST"
-echo "Repository: $TRAVIS_REPO_SLUG"
-
-# Travis does the git clone with a limited depth (50 at the time of
-# writing). This may not be enough to find the common ancestor with
-# $REMOTE/develop so we unshallow the git checkout
-if [[ -a .git/shallow ]]; then
-    echo -e '\nTrying to unshallow the repo:'
-    echo '--------------------------------------------------------------------------------'
-    git fetch --unshallow
-fi
-
-if [[ "$TRAVIS" == "true" ]]; then
-    if [[ "$TRAVIS_BRANCH" == "master" ]]
-    then
-        # We do not test PEP8 on the master branch (or for the PR test into
-        # master) as this results in failures which are only shown for the
-        # pull request to finish a release (development to master) and are
-        # therefore a pain to fix
-        exit 0
-    fi
-    if [[ "$TRAVIS_PULL_REQUEST" == "false" ]]
-    then
-        # In main repo, using TRAVIS_COMMIT_RANGE to test the commits
-        # that were pushed into a branch
-        if [[ "$PROJECT" == "$TRAVIS_REPO_SLUG" ]]; then
-            if [[ -z "$TRAVIS_COMMIT_RANGE" ]]; then
-                echo "New branch, no commit range from Travis so passing this test by convention"
-                exit 0
-            fi
-            COMMIT_RANGE=$TRAVIS_COMMIT_RANGE
-        fi
-    else
-        # We want to fetch the code as it is in the PR branch and not
-        # the result of the merge into develop. This way line numbers
-        # reported by Travis will match with the local code.
-        LOCAL_BRANCH_REF=travis_pr_$TRAVIS_PULL_REQUEST
-        # In Travis the PR target is always origin
-        git fetch origin pull/$TRAVIS_PULL_REQUEST/head:refs/$LOCAL_BRANCH_REF
-    fi
-fi
-
-# If not using the commit range from Travis we need to find the common
-# ancestor between $LOCAL_BRANCH_REF and $REMOTE/develop
-if [[ -z "$COMMIT_RANGE" ]]; then
-    if [[ -z "$LOCAL_BRANCH_REF" ]]; then
-        LOCAL_BRANCH_REF=$(git rev-parse --abbrev-ref HEAD)
-    fi
-    echo -e "\nLast 2 commits in $LOCAL_BRANCH_REF:"
-    echo '--------------------------------------------------------------------------------'
-    git --no-pager log -2 $LOCAL_BRANCH_REF
-
-    REMOTE_DEV_REF="$REMOTE/develop"
-    # Make sure that $REMOTE_DEV_REF is a valid reference
-    echo -e "\nFetching $REMOTE_DEV_REF"
-    echo '--------------------------------------------------------------------------------'
-    git fetch $REMOTE develop:refs/remotes/$REMOTE_DEV_REF
-    LOCAL_BRANCH_SHORT_HASH=$(git rev-parse --short $LOCAL_BRANCH_REF)
-    REMOTE_DEV_SHORT_HASH=$(git rev-parse --short $REMOTE_DEV_REF)
-
-    COMMIT=$(git merge-base $LOCAL_BRANCH_REF $REMOTE_DEV_REF) || \
-        echo "No common ancestor found for $(git show $LOCAL_BRANCH_REF -q) and $(git show $REMOTE_DEV_REF -q)"
-
-    if [ -z "$COMMIT" ]; then
-        exit 1
-    fi
-
-    COMMIT_SHORT_HASH=$(git rev-parse --short $COMMIT)
-
-    echo -e "\nCommon ancestor between $LOCAL_BRANCH_REF ($LOCAL_BRANCH_SHORT_HASH)"\
-         "and $REMOTE_DEV_REF ($REMOTE_DEV_SHORT_HASH) is $COMMIT_SHORT_HASH:"
-    echo '--------------------------------------------------------------------------------'
-    git --no-pager show --no-patch $COMMIT_SHORT_HASH
-
-    COMMIT_RANGE="$COMMIT_SHORT_HASH..$LOCAL_BRANCH_SHORT_HASH"
-
-    if [[ -n "$TMP_REMOTE" ]]; then
-        git remote remove $TMP_REMOTE
-    fi
-
-else
-    echo "Got the commit range from Travis: $COMMIT_RANGE"
-fi
-
-echo -e '\nRunning flake8 on the diff in the range' "$COMMIT_RANGE" \
-     "($(git rev-list $COMMIT_RANGE | wc -l) commit(s)):"
-echo '--------------------------------------------------------------------------------'
-# We need the following command to exit with 0 hence the echo in case
-# there is no match
-MODIFIED_FILES="$(git diff --no-ext-diff --name-only $COMMIT_RANGE || echo "no_match")"
-
-check_files() {
-    files="$1"
-    shift
-    options="$*"
-    if [ -n "$files" ]; then
-        # Conservative approach: diff without context (--unified=0) so that code
-        # that was not changed does not create failures
-        # git diff --no-ext-diff --unified=0 $COMMIT_RANGE -- $files | flake8 --ignore E402 --diff --show-source $options
-        flake8 --ignore E402,W503 --show-source --max-line-length 100 $options
-    fi
-}
-
-if [[ "$MODIFIED_FILES" == "no_match" ]]; then
-    echo "No file has been modified"
-else
-
-    check_files "$(echo "$MODIFIED_FILES" | grep -v ^examples)"
-    check_files "$(echo "$MODIFIED_FILES" | grep ^examples)" \
-        --config ./examples/.flake8
-fi
-echo -e "No problem detected by flake8\n"
+flake8 --ignore E402,W503 --show-source --max-line-length 100
diff --git a/examples/datasets_tutorial.py b/examples/datasets_tutorial.py
@@ -54,7 +54,7 @@
 
 ############################################################################
 # Get the actual data.
-# 
+#
 # Returned as numpy array, with meta-info
 # (e.g. target feature, feature names, ...)
 X, y, attribute_names = dataset.get_data(
diff --git a/examples/flows_and_runs_tutorial.py b/examples/flows_and_runs_tutorial.py
@@ -58,7 +58,8 @@
 ############################################################################
 # Share the run on the OpenML server
 #
-# So far the run is only available locally. By calling the publish function, the run is sent to the OpenML server:
+# So far the run is only available locally. By calling the publish function,
+# the run is sent to the OpenML server:
 
 myrun = run.publish()
 # For this tutorial, our configuration publishes to the test server
@@ -96,11 +97,16 @@
 # compare your results with the rest of the class and learn from
 # them. Some tasks you could try (or browse openml.org):
 #
-# * EEG eye state: data_id:`1471 <http://www.openml.org/d/1471>`_, task_id:`14951 <http://www.openml.org/t/14951>`_
-# * Volcanoes on Venus: data_id:`1527 <http://www.openml.org/d/1527>`_, task_id:`10103 <http://www.openml.org/t/10103>`_
-# * Walking activity: data_id:`1509 <http://www.openml.org/d/1509>`_, task_id:`9945 <http://www.openml.org/t/9945>`_, 150k instances.
-# * Covertype (Satellite): data_id:`150 <http://www.openml.org/d/150>`_, task_id:`218 <http://www.openml.org/t/218>`_, 500k instances.
-# * Higgs (Physics): data_id:`23512 <http://www.openml.org/d/23512>`_, task_id:`52950 <http://www.openml.org/t/52950>`_, 100k instances, missing values.
+# * EEG eye state: data_id:`1471 <http://www.openml.org/d/1471>`_,
+#   task_id:`14951 <http://www.openml.org/t/14951>`_
+# * Volcanoes on Venus: data_id:`1527 <http://www.openml.org/d/1527>`_,
+#   task_id:`10103 <http://www.openml.org/t/10103>`_
+# * Walking activity: data_id:`1509 <http://www.openml.org/d/1509>`_,
+#   task_id:`9945 <http://www.openml.org/t/9945>`_, 150k instances.
+# * Covertype (Satellite): data_id:`150 <http://www.openml.org/d/150>`_,
+#   task_id:`218 <http://www.openml.org/t/218>`_, 500k instances.
+# * Higgs (Physics): data_id:`23512 <http://www.openml.org/d/23512>`_,
+#   task_id:`52950 <http://www.openml.org/t/52950>`_, 100k instances, missing values.
 
 # Easy benchmarking:
 for task_id in [115, ]:  # Add further tasks. Disclaimer: they might take some time
diff --git a/examples/introduction_tutorial.py b/examples/introduction_tutorial.py
@@ -23,21 +23,27 @@
 #
 #     pip install openml
 #
-# For further information, please check out the installation guide at https://openml.github.io/openml-python/stable/contributing.html#installation
+# For further information, please check out the installation guide at
+# https://openml.github.io/openml-python/master/contributing.html#installation
 #
 # Authentication
 # ^^^^^^^^^^^^^^
 #
-# The OpenML server can only be accessed by users who have signed up on the OpenML platform. If you don’t have an account yet, sign up now.
-# You will receive an API key, which will authenticate you to the server and allow you to download and upload datasets, tasks, runs and flows.
+# The OpenML server can only be accessed by users who have signed up on the
+# OpenML platform. If you don’t have an account yet, sign up now.
+# You will receive an API key, which will authenticate you to the server
+# and allow you to download and upload datasets, tasks, runs and flows.
 #
 # * Create an OpenML account (free) on http://www.openml.org.
 # * After logging in, open your account page (avatar on the top right)
 # * Open 'Account Settings', then 'API authentication' to find your API key.
 #
 # There are two ways to authenticate:
 #
-# * Create a plain text file **~/.openml/config** with the line **'apikey=MYKEY'**, replacing **MYKEY** with your API key. The config file must be in the directory ~/.openml/config and exist prior to importing the openml module
+# * Create a plain text file **~/.openml/config** with the line
+#   **'apikey=MYKEY'**, replacing **MYKEY** with your API key. The config
+#   file must be located at ~/.openml/config and exist prior to
+#   importing the openml module.
 # * Run the code below, replacing 'YOURKEY' with your API key.
 
 ############################################################################
@@ -50,13 +56,18 @@
 ############################################################################
 # Caching
 # ^^^^^^^
-# When downloading datasets, tasks, runs and flows, they will be cached to retrieve them without calling the server later. As with the API key, the cache directory can be either specified through the config file or through the API:
+# When downloading datasets, tasks, runs and flows, they will be cached to
+# retrieve them without calling the server later. As with the API key,
+# the cache directory can be either specified through the config file or
+# through the API:
 #
-# * Add the  line **cachedir = 'MYDIR'** to the config file, replacing 'MYDIR' with the path to the cache directory. By default, OpenML will use **~/.openml/cache** as the cache directory.
+# * Add the line **cachedir = 'MYDIR'** to the config file, replacing
+#   'MYDIR' with the path to the cache directory. By default, OpenML
+#   will use **~/.openml/cache** as the cache directory.
 # * Run the code below, replacing 'YOURDIR' with the path to the cache directory.
 
-import os
 # Uncomment and set your OpenML cache directory
+# import os
 # openml.config.cache_directory = os.path.expanduser('YOURDIR')
 
 ############################################################################
diff --git a/examples/run_setup_tutorial.py b/examples/run_setup_tutorial.py
@@ -24,6 +24,7 @@
     2) Download the flow, reinstantiate the model with same hyperparameters,
        and solve the same task again;
     3) We will verify that the obtained results are exactly the same.
+
 """
 import logging
 import numpy as np
@@ -75,8 +76,7 @@
 run_original = run.publish()  # this implicitly uploads the flow
 
 ###############################################################################
-# 2) Download the flow, reinstantiate the model with same hyperparameters,
-#    and solve the same task again.
+# 2) Download the flow and solve the same task again.
 ###############################################################################
 
 # obtain setup id (note that the setup id is assigned by the OpenML server -
diff --git a/examples/tasks_tutorial.py b/examples/tasks_tutorial.py
@@ -13,9 +13,16 @@
 #
 # Tasks are identified by IDs and can be accessed in two different ways:
 #
-# 1. In a list providing basic information on all tasks available on OpenML. This function will not download the actual tasks, but will instead download meta data that can be used to filter the tasks and retrieve a set of IDs. We can filter this list, for example, we can only list tasks having a special tag or only tasks for a specific target such as *supervised classification*.
+# 1. In a list providing basic information on all tasks available on OpenML.
+#    This function will not download the actual tasks, but will instead download
+#    meta data that can be used to filter the tasks and retrieve a set of IDs.
+#    We can filter this list, for example, we can only list tasks having a
+#    special tag or only tasks for a specific target such as
+#    *supervised classification*.
 #
-# 2. A single task by its ID. It contains all meta information, the target metric, the splits and an iterator which can be used to access the splits in a useful manner.
+# 2. A single task by its ID. It contains all meta information, the target
+#    metric, the splits and an iterator which can be used to access the
+#    splits in a useful manner.
 
 ############################################################################
 # Listing tasks
@@ -36,7 +43,8 @@
 pprint(tasks.head())
 
 ############################################################################
-# We can filter the list of tasks to only contain datasets with more than 500 samples, but less than 1000 samples:
+# We can filter the list of tasks to only contain datasets with more than
+# 500 samples, but less than 1000 samples:
 
 filtered_tasks = tasks.query('NumberOfInstances > 500 and NumberOfInstances < 1000')
 print(list(filtered_tasks.index))
@@ -58,7 +66,8 @@
 print(len(filtered_tasks))
 
 ############################################################################
-# Resampling strategies can be found on the `OpenML Website <http://www.openml.org/search?type=measure&q=estimation%20procedure>`_.
+# Resampling strategies can be found on the
+# `OpenML Website <http://www.openml.org/search?type=measure&q=estimation%20procedure>`_.
 #
 # Similar to listing tasks by task type, we can list tasks by tags:
 
@@ -111,7 +120,9 @@
 # Downloading tasks
 # ^^^^^^^^^^^^^^^^^
 #
-# We provide two functions to download tasks, one which downloads only a single task by its ID, and one which takes a list of IDs and downloads all of these tasks:
+# We provide two functions to download tasks, one which downloads only a
+# single task by its ID, and one which takes a list of IDs and downloads
+# all of these tasks:
 
 task_id = 1
 task = openml.tasks.get_task(task_id)
@@ -127,5 +138,3 @@
 ids = [1, 2, 19, 97, 403]
 tasks = openml.tasks.get_tasks(ids)
 pprint(tasks[0])
-
-
diff --git a/openml/datasets/__init__.py b/openml/datasets/__init__.py
@@ -1,4 +1,5 @@
 from .functions import (
+    attributes_arff_from_df,
     check_datasets_active,
     create_dataset,
     get_dataset,
@@ -10,6 +11,7 @@
 from .data_feature import OpenMLDataFeature
 
 __all__ = [
+    'attributes_arff_from_df',
     'check_datasets_active',
     'create_dataset',
     'get_dataset',
diff --git a/tests/test_study/test_study_functions.py b/tests/test_study/test_study_functions.py
@@ -148,12 +148,12 @@ def test_study_attach_illegal(self):
         study_id = study.publish()
         study_original = openml.study.get_study(study_id)
 
-        with self.assertRaisesRegex(openml.exceptions.OpenMLServerException, 
+        with self.assertRaisesRegex(openml.exceptions.OpenMLServerException,
                                     'Problem attaching entities.'):
             # run id does not exists
             openml.study.attach_to_study(study_id, [0])
 
-        with self.assertRaisesRegex(openml.exceptions.OpenMLServerException, 
+        with self.assertRaisesRegex(openml.exceptions.OpenMLServerException,
                                     'Problem attaching entities.'):
             # some runs already attached
             openml.study.attach_to_study(study_id, list(run_list_more.keys()))
diff --git a/tests/test_utils/test_utils.py b/tests/test_utils/test_utils.py
@@ -47,7 +47,7 @@ def test_list_datasets_with_high_size_parameter(self):
         datasets_b = openml.datasets.list_datasets(size=np.inf)
 
         # note that in the meantime the number of datasets could have increased
-        # due to tests that run in parralel. 
+        # due to tests that run in parallel.
         self.assertGreaterEqual(len(datasets_b), len(datasets_a))
 
     def test_list_all_for_tasks(self):