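Add logging to diagnose task split-file cache failures in CI

When the cached split file cannot be read or parsed (OSError or
arff.BadDataFormat), log the failure, delete the stale file, and
download it again. Also set the pytest log level to DEBUG so these
messages are captured in test runs.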

PGijsbers · PGijsbers · commit fbc18292f02d · 2026-03-09T15:05:37.000+01:00
diff --git a/openml/tasks/task.py b/openml/tasks/task.py
@@ -1,14 +1,16 @@
 # License: BSD 3-Clause
 from __future__ import annotations
 
+import logging
 import warnings
 from abc import ABC
 from collections.abc import Sequence
 from enum import Enum
-from pathlib import Path
 from typing import TYPE_CHECKING, Any, ClassVar
 from typing_extensions import TypedDict
 
+import arff
+
 import openml._api_calls
 import openml.config
 from openml import datasets
@@ -22,6 +24,9 @@
     import pandas as pd
 
 
+logger = logging.getLogger(__name__)
+
+
 # TODO(eddiebergman): Should use `auto()` but might be too late if these numbers are used
 # and stored on server.
 class TaskType(Enum):
@@ -178,18 +183,6 @@ def get_train_test_split_indices(
 
         return self.split.get(repeat=repeat, fold=fold, sample=sample)
 
-    def _download_split(self, cache_file: Path) -> None:
-        # TODO(eddiebergman): Not sure about this try to read and error approach
-        try:
-            with cache_file.open(encoding="utf8"):
-                pass
-        except OSError:
-            split_url = self.estimation_procedure["data_splits_url"]
-            openml._api_calls._download_text_file(
-                source=str(split_url),
-                output_path=str(cache_file),
-            )
-
     def download_split(self) -> OpenMLSplit:
         """Download the OpenML split for a given task."""
         # TODO(eddiebergman): Can this every be `None`?
@@ -199,9 +192,23 @@ def download_split(self) -> OpenMLSplit:
 
         try:
             split = OpenMLSplit._from_arff_file(cached_split_file)
-        except OSError:
+            logger.debug("Loaded file from cache: %s", str(cached_split_file))
+        except (OSError, arff.BadDataFormat):
+            logger.info("Failed to load file from cache: %s", str(cached_split_file))
+            if cached_split_file.exists():
+                logger.debug("Cleaning up old file")
+                cached_split_file.unlink()
             # Next, download and cache the associated split file
-            self._download_split(cached_split_file)
+            split_url = self.estimation_procedure["data_splits_url"]
+            openml._api_calls._download_text_file(
+                source=str(split_url),
+                output_path=str(cached_split_file),
+            )
+            if cached_split_file.exists():
+                logger.info("New file created of size %d", cached_split_file.stat().st_size)
+            else:
+                logger.info("Failed to create new file")
+
             split = OpenMLSplit._from_arff_file(cached_split_file)
 
         return split
diff --git a/pyproject.toml b/pyproject.toml
@@ -126,6 +126,7 @@ version = {attr = "openml.__version__.__version__"}
 
 # https://docs.pytest.org/en/7.2.x/reference/reference.html#ini-options-ref
 [tool.pytest.ini_options]
+log_level="DEBUG"
 testpaths = ["tests"]
 minversion = "7.0"
 xfail_strict = true