Skip to content

Commit aecb6ac

Browse files
PGijsbers and mfeurer
authored and committed
Fix #612: lazy download dataset (#644)
* First iteration of lazy loading. Does not yet take into account all places that might use the arff file internally. * Factor functionality of loading ARFF to correct data format and pickling it out of __init__. * Extracted a more general 'download_text_file' function that is now used when downloading the arff file. * Download data when get_data is called and it had not yet been downloaded. * Update unit tests. * Also check if download is required for retrieve class labels. * add test to ensure all functionality works without retrieving data. * update doc/hint. * Flake8, unused imports, spacing around = * Always return path to pickle file. * Add notice of lazy loading to dataset tutorial. * Simplified `retrieve_class_labels` using the already downloaded feature metadata. * Fix a bug where nominal feature with a single unique value is treated differently from one with multiple (e.g. feat 5 of d/2). * Apply AppVeyor fix. * Update feature xml to most recent. * Update test to reflect retrieve_class_labels is now available with lazy loading. * Unify loading of features between cached and downloaded. * Flake8. * Add random element to tag to avoid race conditions in parallel tests.
1 parent 94102f3 commit aecb6ac

7 files changed

Lines changed: 481 additions & 233 deletions

File tree

appveyor.yml

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -29,9 +29,8 @@ install:
2929
- rmdir C:\\cygwin /s /q
3030

3131
# Update previous packages and install the build and runtime dependencies of the project.
32-
# XXX: setuptools>23 is currently broken on Win+py3 with numpy
33-
# (https://github.com/pypa/setuptools/issues/728)
34-
- conda update --all --yes setuptools=23
32+
- conda update conda --yes
33+
- conda update --all --yes
3534

3635
# Install the build and runtime dependencies of the project.
3736
- "cd C:\\projects\\openml-python"

examples/datasets_tutorial.py

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -77,6 +77,15 @@
7777
print(X.head())
7878
print(X.info())
7979

80+
############################################################################
81+
# Sometimes you only need access to a dataset's metadata.
82+
# In those cases, you can download the dataset without downloading the
83+
# data file. The dataset object can be used as normal.
84+
# Whenever you use any functionality that requires the data,
85+
# such as `get_data`, the data will be downloaded.
86+
dataset = openml.datasets.get_dataset(68, download_data=False)
87+
88+
8089
############################################################################
8190
# Exercise 2
8291
# **********

openml/datasets/dataset.py

Lines changed: 123 additions & 121 deletions
Original file line numberDiff line numberDiff line change
@@ -157,7 +157,7 @@ def __init__(self, name, description, format=None,
157157
feature = OpenMLDataFeature(int(xmlfeature['oml:index']),
158158
xmlfeature['oml:name'],
159159
xmlfeature['oml:data_type'],
160-
None,
160+
xmlfeature.get('oml:nominal_value'),
161161
int(nr_missing))
162162
if idx != feature.index:
163163
raise ValueError('Data features not provided '
@@ -167,96 +167,104 @@ def __init__(self, name, description, format=None,
167167
self.qualities = _check_qualities(qualities)
168168

169169
if data_file is not None:
170-
self.data_pickle_file = data_file.replace('.arff', '.pkl.py3')
170+
self.data_pickle_file = self._data_arff_to_pickle(data_file)
171+
else:
172+
self.data_pickle_file = None
171173

172-
if os.path.exists(self.data_pickle_file):
173-
logger.debug("Data pickle file already exists.")
174-
else:
175-
try:
176-
data = self._get_arff(self.format)
177-
except OSError as e:
178-
logger.critical("Please check that the data file %s is "
179-
"there and can be read.", self.data_file)
180-
raise e
181-
182-
ARFF_DTYPES_TO_PD_DTYPE = {
183-
'INTEGER': 'integer',
184-
'REAL': 'floating',
185-
'NUMERIC': 'floating',
186-
'STRING': 'string'
187-
}
188-
attribute_dtype = {}
189-
attribute_names = []
190-
categories_names = {}
191-
categorical = []
192-
for name, type_ in data['attributes']:
193-
# if the feature is nominal and the a sparse matrix is
194-
# requested, the categories need to be numeric
195-
if (isinstance(type_, list)
196-
and self.format.lower() == 'sparse_arff'):
197-
try:
198-
np.array(type_, dtype=np.float32)
199-
except ValueError:
200-
raise ValueError(
201-
"Categorical data needs to be numeric when "
202-
"using sparse ARFF."
203-
)
204-
# string can only be supported with pandas DataFrame
205-
elif (type_ == 'STRING'
206-
and self.format.lower() == 'sparse_arff'):
174+
def _data_arff_to_pickle(self, data_file):
175+
data_pickle_file = data_file.replace('.arff', '.pkl.py3')
176+
if os.path.exists(data_pickle_file):
177+
logger.debug("Data pickle file already exists.")
178+
return data_pickle_file
179+
else:
180+
try:
181+
data = self._get_arff(self.format)
182+
except OSError as e:
183+
logger.critical("Please check that the data file %s is "
184+
"there and can be read.", data_file)
185+
raise e
186+
187+
ARFF_DTYPES_TO_PD_DTYPE = {
188+
'INTEGER': 'integer',
189+
'REAL': 'floating',
190+
'NUMERIC': 'floating',
191+
'STRING': 'string'
192+
}
193+
attribute_dtype = {}
194+
attribute_names = []
195+
categories_names = {}
196+
categorical = []
197+
for name, type_ in data['attributes']:
198+
# if the feature is nominal and the a sparse matrix is
199+
# requested, the categories need to be numeric
200+
if (isinstance(type_, list)
201+
and self.format.lower() == 'sparse_arff'):
202+
try:
203+
np.array(type_, dtype=np.float32)
204+
except ValueError:
207205
raise ValueError(
208-
"Dataset containing strings is not supported "
209-
"with sparse ARFF."
206+
"Categorical data needs to be numeric when "
207+
"using sparse ARFF."
210208
)
211-
212-
# infer the dtype from the ARFF header
213-
if isinstance(type_, list):
214-
categorical.append(True)
215-
categories_names[name] = type_
216-
if len(type_) == 2:
217-
type_norm = [cat.lower().capitalize()
218-
for cat in type_]
219-
if set(['True', 'False']) == set(type_norm):
220-
categories_names[name] = [
221-
True if cat == 'True' else False
222-
for cat in type_norm
223-
]
224-
attribute_dtype[name] = 'boolean'
225-
else:
226-
attribute_dtype[name] = 'categorical'
209+
# string can only be supported with pandas DataFrame
210+
elif (type_ == 'STRING'
211+
and self.format.lower() == 'sparse_arff'):
212+
raise ValueError(
213+
"Dataset containing strings is not supported "
214+
"with sparse ARFF."
215+
)
216+
217+
# infer the dtype from the ARFF header
218+
if isinstance(type_, list):
219+
categorical.append(True)
220+
categories_names[name] = type_
221+
if len(type_) == 2:
222+
type_norm = [cat.lower().capitalize()
223+
for cat in type_]
224+
if set(['True', 'False']) == set(type_norm):
225+
categories_names[name] = [
226+
True if cat == 'True' else False
227+
for cat in type_norm
228+
]
229+
attribute_dtype[name] = 'boolean'
227230
else:
228231
attribute_dtype[name] = 'categorical'
229232
else:
230-
categorical.append(False)
231-
attribute_dtype[name] = ARFF_DTYPES_TO_PD_DTYPE[type_]
232-
attribute_names.append(name)
233-
234-
if self.format.lower() == 'sparse_arff':
235-
X = data['data']
236-
X_shape = (max(X[1]) + 1, max(X[2]) + 1)
237-
X = scipy.sparse.coo_matrix(
238-
(X[0], (X[1], X[2])), shape=X_shape, dtype=np.float32)
239-
X = X.tocsr()
240-
241-
elif self.format.lower() == 'arff':
242-
X = pd.DataFrame(data['data'], columns=attribute_names)
243-
244-
col = []
245-
for column_name in X.columns:
246-
if attribute_dtype[column_name] in ('categorical',
247-
'boolean'):
248-
col.append(self._unpack_categories(
249-
X[column_name], categories_names[column_name]))
250-
else:
251-
col.append(X[column_name])
252-
X = pd.concat(col, axis=1)
253-
254-
# Pickle the dataframe or the sparse matrix.
255-
with open(self.data_pickle_file, "wb") as fh:
256-
pickle.dump((X, categorical, attribute_names), fh, -1)
257-
logger.debug("Saved dataset %d: %s to file %s" %
258-
(int(self.dataset_id or -1), self.name,
259-
self.data_pickle_file))
233+
attribute_dtype[name] = 'categorical'
234+
else:
235+
categorical.append(False)
236+
attribute_dtype[name] = ARFF_DTYPES_TO_PD_DTYPE[type_]
237+
attribute_names.append(name)
238+
239+
if self.format.lower() == 'sparse_arff':
240+
X = data['data']
241+
X_shape = (max(X[1]) + 1, max(X[2]) + 1)
242+
X = scipy.sparse.coo_matrix(
243+
(X[0], (X[1], X[2])), shape=X_shape, dtype=np.float32)
244+
X = X.tocsr()
245+
246+
elif self.format.lower() == 'arff':
247+
X = pd.DataFrame(data['data'], columns=attribute_names)
248+
249+
col = []
250+
for column_name in X.columns:
251+
if attribute_dtype[column_name] in ('categorical',
252+
'boolean'):
253+
col.append(self._unpack_categories(
254+
X[column_name], categories_names[column_name]))
255+
else:
256+
col.append(X[column_name])
257+
X = pd.concat(col, axis=1)
258+
259+
# Pickle the dataframe or the sparse matrix.
260+
with open(data_pickle_file, "wb") as fh:
261+
pickle.dump((X, categorical, attribute_names), fh, -1)
262+
logger.debug("Saved dataset {did}: {name} to file {path}"
263+
.format(did=int(self.dataset_id or -1),
264+
name=self.name,
265+
path=data_pickle_file)
266+
)
267+
return data_pickle_file
260268

261269
def push_tag(self, tag):
262270
"""Annotates this data set with a tag on the server.
@@ -394,13 +402,19 @@ def _unpack_categories(series, categories):
394402
return pd.Series(col, index=series.index, dtype='category',
395403
name=series.name)
396404

397-
def get_data(self, target=None,
398-
include_row_id=False,
399-
include_ignore_attributes=False,
400-
return_categorical_indicator=False,
401-
return_attribute_names=False,
402-
dataset_format=None):
403-
"""Returns dataset content as dataframes or sparse matrices.
405+
def _download_data(self) -> None:
406+
""" Download ARFF data file to standard cache directory. Set `self.data_file`. """
407+
# import required here to avoid circular import.
408+
from .functions import _get_dataset_arff
409+
self.data_file = _get_dataset_arff(self)
410+
411+
def get_data(self, target: str = None,
412+
include_row_id: bool = False,
413+
include_ignore_attributes: bool = False,
414+
return_categorical_indicator: bool = False,
415+
return_attribute_names: bool = False,
416+
dataset_format: str = None):
417+
""" Returns dataset content as dataframes or sparse matrices.
404418
405419
Parameters
406420
----------
@@ -416,10 +430,10 @@ def get_data(self, target=None,
416430
categorical.
417431
return_attribute_names : boolean (default=False)
418432
Whether to return attribute names.
419-
dataset_format : string
420-
The format of returned dataset. If ``array``, the returned dataset
421-
will be a NumPy array or a SciPy sparse matrix. If ``dataframe``,
422-
the returned dataset will be a Pandas DataFrame or SparseDataFrame.
433+
dataset_format : string, optional
434+
The format of returned dataset.
435+
If ``array``, the returned dataset will be a NumPy array or a SciPy sparse matrix.
436+
If ``dataframe``, the returned dataset will be a Pandas DataFrame or SparseDataFrame.
423437
424438
Returns
425439
-------
@@ -428,12 +442,11 @@ def get_data(self, target=None,
428442
y : ndarray or series, shape (n_samples,)
429443
Target column(s). Only returned if target is not None.
430444
categorical_indicator : boolean ndarray
431-
Mask that indicate categorical features. Only returned if
432-
return_categorical_indicator is True.
445+
Mask that indicate categorical features.
446+
Only returned if return_categorical_indicator is True.
433447
return_attribute_names : list of strings
434-
List of attribute names. Returned only if return_attribute_names is
435-
True.
436-
448+
List of attribute names.
449+
Only returned if return_attribute_names is True.
437450
"""
438451
if dataset_format is None:
439452
warn('The default of "dataset_format" will change from "array" to'
@@ -442,6 +455,11 @@ def get_data(self, target=None,
442455

443456
rval = []
444457

458+
if self.data_pickle_file is None:
459+
if self.data_file is None:
460+
self._download_data()
461+
self.data_pickle_file = self._data_arff_to_pickle(self.data_file)
462+
445463
path = self.data_pickle_file
446464
if not os.path.exists(path):
447465
raise ValueError("Cannot find a pickle file for dataset %s at "
@@ -554,26 +572,10 @@ def retrieve_class_labels(self, target_name='class'):
554572
-------
555573
list
556574
"""
557-
558-
# TODO improve performance, currently reads the whole file
559-
# Should make a method that only reads the attributes
560-
arffFileName = self.data_file
561-
562-
if self.format.lower() == 'arff':
563-
return_type = arff.DENSE
564-
elif self.format.lower() == 'sparse_arff':
565-
return_type = arff.COO
566-
else:
567-
raise ValueError('Unknown data format %s' % self.format)
568-
569-
with io.open(arffFileName, encoding='utf8') as fh:
570-
arffData = arff.ArffDecoder().decode(fh, return_type=return_type)
571-
572-
dataAttributes = dict(arffData['attributes'])
573-
if target_name in dataAttributes:
574-
return dataAttributes[target_name]
575-
else:
576-
return None
575+
for feature in self.features.values():
576+
if (feature.name == target_name) and (feature.data_type == 'nominal'):
577+
return feature.nominal_values
578+
return None
577579

578580
def get_features_by_type(self, data_type, exclude=None,
579581
exclude_ignore_attributes=True,

0 commit comments

Comments
 (0)