11# License: BSD 3-Clause
22from __future__ import annotations
33
4+ import logging
45import warnings
56from abc import ABC
67from collections .abc import Sequence
78from enum import Enum
8- from pathlib import Path
99from typing import TYPE_CHECKING , Any , ClassVar
1010from typing_extensions import TypedDict
1111
12+ import arff
13+
1214import openml ._api_calls
1315import openml .config
1416from openml import datasets
2224 import pandas as pd
2325
2426
27+ logger = logging .getLogger (__name__ )
28+
29+
2530# TODO(eddiebergman): Should use `auto()` but might be too late if these numbers are used
2631# and stored on server.
2732class TaskType (Enum ):
@@ -178,18 +183,6 @@ def get_train_test_split_indices(
178183
179184 return self .split .get (repeat = repeat , fold = fold , sample = sample )
180185
181- def _download_split (self , cache_file : Path ) -> None :
182- # TODO(eddiebergman): Not sure about this try to read and error approach
183- try :
184- with cache_file .open (encoding = "utf8" ):
185- pass
186- except OSError :
187- split_url = self .estimation_procedure ["data_splits_url" ]
188- openml ._api_calls ._download_text_file (
189- source = str (split_url ),
190- output_path = str (cache_file ),
191- )
192-
193186 def download_split (self ) -> OpenMLSplit :
194187 """Download the OpenML split for a given task."""
195188 # TODO(eddiebergman): Can this ever be `None`?
@@ -199,9 +192,23 @@ def download_split(self) -> OpenMLSplit:
199192
200193 try :
201194 split = OpenMLSplit ._from_arff_file (cached_split_file )
202- except OSError :
195+ logger .debug ("Loaded file from cache: %s" , str (cached_split_file ))
196+ except (OSError , arff .BadDataFormat ):
197+ logger .info ("Failed to load file from cache: %s" , str (cached_split_file ))
198+ if cached_split_file .exists ():
199+ logger .debug ("Cleaning up old file" )
200+ cached_split_file .unlink ()
203201 # Next, download and cache the associated split file
204- self ._download_split (cached_split_file )
202+ split_url = self .estimation_procedure ["data_splits_url" ]
203+ openml ._api_calls ._download_text_file (
204+ source = str (split_url ),
205+ output_path = str (cached_split_file ),
206+ )
207+ if cached_split_file .exists ():
208+ logger .info ("New file created of size %d" , cached_split_file .stat ().st_size )
209+ else :
210+ logger .info ("Failed to create new file" )
211+
205212 split = OpenMLSplit ._from_arff_file (cached_split_file )
206213
207214 return split
0 commit comments