Skip to content

Commit b1a9e7f

Browse files
committed
Merge branch 'pr-1577' into migration (merge conflicts)
2 parents 50eed37 + 79f6187 commit b1a9e7f

17 files changed

Lines changed: 127 additions & 61 deletions

File tree

.github/workflows/test.yml

Lines changed: 74 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -34,15 +34,27 @@ jobs:
3434
sklearn-only: ["true"]
3535

3636
exclude:
37-
# incompatible version combinations
37+
# (python, sklearn) combinations for which there is no PyPI release
38+
# scikit-learn 1.3
3839
- python-version: "3.13"
3940
scikit-learn: "1.3.*"
40-
- python-version: "3.13"
41-
scikit-learn: "1.4.*"
4241
- python-version: "3.14"
4342
scikit-learn: "1.3.*"
43+
# scikit-learn 1.4
44+
- python-version: "3.13"
45+
scikit-learn: "1.4.*"
4446
- python-version: "3.14"
4547
scikit-learn: "1.4.*"
48+
# scikit-learn 1.5
49+
- python-version: "3.14"
50+
scikit-learn: "1.5.*"
51+
# scikit-learn 1.6
52+
- python-version: "3.14"
53+
scikit-learn: "1.6.*"
54+
# scikit-learn 1.7 is installed with pandas 3, which does not support Python 3.10
55+
- python-version: "3.10"
56+
scikit-learn: "1.7.*"
57+
4658

4759
include:
4860
# Full test run on ubuntu, 3.14
@@ -64,14 +76,6 @@ jobs:
6476
sklearn-only: "false"
6577
code-cov: true
6678

67-
# Pandas 2 run
68-
- os: ubuntu-latest
69-
python-version: "3.12"
70-
scikit-learn: "1.5.*"
71-
sklearn-only: "false"
72-
pandas-version: "2.*"
73-
code-cov: false
74-
7579
steps:
7680
- uses: actions/checkout@v6
7781
with:
@@ -82,15 +86,21 @@ jobs:
8286
with:
8387
python-version: ${{ matrix.python-version }}
8488

85-
- name: Install test dependencies, scikit-learn, and optional pandas
89+
- name: Install test dependencies, scikit-learn, and pandas
8690
shell: bash
8791
run: |
8892
python -m pip install --upgrade pip
8993
pip install -e .[test] scikit-learn==${{ matrix.scikit-learn }}
90-
91-
if [ "${{ matrix.pandas-version }}" != "" ]; then
92-
echo "Installing specific pandas version: ${{ matrix.pandas-version }}"
93-
pip install "pandas==${{ matrix.pandas-version }}"
94+
95+
# CI tests scikit-learn 1.7+ with pandas 3.x; earlier scikit-learn versions are tested with pandas 2.x
96+
version="${{ matrix.scikit-learn }}"
97+
major=$(echo "$version" | cut -d. -f1)
98+
minor=$(echo "$version" | cut -d. -f2)
99+
100+
if [[ "$major" -gt 1 ]] || { [[ "$major" -eq 1 ]] && [[ "$minor" -ge 7 ]]; }; then
101+
pip install "pandas==3.*"
102+
else
103+
pip install "pandas==2.*"
94104
fi
95105
96106
- name: Store repository status
@@ -101,22 +111,46 @@ jobs:
101111
echo "BEFORE=$git_status" >> $GITHUB_ENV
102112
echo "Repository status before tests: $git_status"
103113
114+
- name: Clone Services
115+
if: matrix.os == 'ubuntu-latest'
116+
id: clone-services
117+
run: |
118+
git clone --depth 1 https://github.com/openml/services.git
119+
120+
- name: Start Docker Services
121+
id: start-services
122+
if: matrix.os == 'ubuntu-latest'
123+
working-directory: ./services
124+
run: |
125+
chmod -R a+rw ./data
126+
chmod -R a+rw ./logs
127+
docker compose --profile rest-api --profile minio --profile evaluation-engine up -d
128+
129+
echo "Waiting for PHP API to boot..."
130+
timeout 60s bash -c 'until [ "$(docker inspect -f {{.State.Health.Status}} openml-php-rest-api)" == "healthy" ]; do sleep 5; done'
131+
132+
echo "Final Verification: Gateway Connectivity..."
133+
curl -sSfL http://localhost:8000/api/v1/xml/data/1 | head -n 15
134+
135+
docker container ls
136+
104137
- name: Show installed dependencies
105138
run: python -m pip list
106139

107140
- name: Run tests on Ubuntu Test
108141
if: matrix.os == 'ubuntu-latest'
109142
env:
110143
OPENML_TEST_SERVER_ADMIN_KEY: ${{ secrets.OPENML_TEST_SERVER_ADMIN_KEY }}
144+
OPENML_USE_LOCAL_SERVICES: "true"
111145
run: |
112146
if [ "${{ matrix.code-cov }}" = "true" ]; then
113147
codecov="--cov=openml --long --cov-report=xml"
114148
fi
115149
116150
if [ "${{ matrix.sklearn-only }}" = "true" ]; then
117-
marks="sklearn and not production_server and not test_server"
151+
marks="sklearn and not production_server"
118152
else
119-
marks="not production_server and not test_server"
153+
marks="not production_server"
120154
fi
121155
122156
pytest -n 4 --durations=20 --dist load -sv $codecov -o log_cli=true -m "$marks"
@@ -125,15 +159,16 @@ jobs:
125159
if: matrix.os == 'ubuntu-latest'
126160
env:
127161
OPENML_TEST_SERVER_ADMIN_KEY: ${{ secrets.OPENML_TEST_SERVER_ADMIN_KEY }}
162+
OPENML_USE_LOCAL_SERVICES: "true"
128163
run: |
129164
if [ "${{ matrix.code-cov }}" = "true" ]; then
130165
codecov="--cov=openml --long --cov-report=xml"
131166
fi
132167
133168
if [ "${{ matrix.sklearn-only }}" = "true" ]; then
134-
marks="sklearn and production_server and not test_server"
169+
marks="sklearn and production_server"
135170
else
136-
marks="production_server and not test_server"
171+
marks="production_server"
137172
fi
138173
139174
pytest -n 4 --durations=20 --dist load -sv $codecov -o log_cli=true -m "$marks"
@@ -145,6 +180,25 @@ jobs:
145180
run: | # we need a separate step because of the bash-specific if-statement in the previous one.
146181
pytest -n 4 --durations=20 --dist load -sv --reruns 5 --reruns-delay 1 -m "not test_server"
147182
183+
- name: Upload coverage
184+
if: matrix.code-cov && always()
185+
uses: codecov/codecov-action@v4
186+
with:
187+
files: coverage.xml
188+
token: ${{ secrets.CODECOV_TOKEN }}
189+
fail_ci_if_error: true
190+
verbose: true
191+
192+
- name: Dump server logs
193+
if: always() && steps.start-services.outcome == 'success'
194+
run: |
195+
docker logs openml-php-rest-api -t
196+
197+
- name: Cleanup Docker setup
198+
if: always() && steps.clone-services.outcome == 'success'
199+
run: |
200+
sudo rm -rf services
201+
148202
- name: Check for files left behind by test
149203
if: matrix.os != 'windows-latest' && always()
150204
run: |
@@ -157,15 +211,6 @@ jobs:
157211
exit 1
158212
fi
159213
160-
- name: Upload coverage
161-
if: matrix.code-cov && always()
162-
uses: codecov/codecov-action@v4
163-
with:
164-
files: coverage.xml
165-
token: ${{ secrets.CODECOV_TOKEN }}
166-
fail_ci_if_error: true
167-
verbose: true
168-
169214
dummy_windows_py_sk024:
170215
name: (windows-latest, Py, sk0.24.*, sk-only:false)
171216
runs-on: ubuntu-latest

openml/base.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,6 @@
88

99
import xmltodict
1010

11-
import openml
1211
import openml._api_calls
1312

1413
from .utils import _get_rest_api_type_alias, _tag_openml_base

openml/evaluations/evaluation.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,6 @@
33

44
from dataclasses import asdict, dataclass
55

6-
import openml
76
import openml.datasets
87
import openml.flows
98
import openml.runs

openml/setups/setup.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,6 @@
44
from dataclasses import asdict, dataclass
55
from typing import Any
66

7-
import openml
87
import openml.flows
98

109

openml/study/functions.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,6 @@
88
import pandas as pd
99
import xmltodict
1010

11-
import openml
1211
import openml._api_calls
1312
import openml.utils
1413
from openml.study.study import OpenMLBenchmarkSuite, OpenMLStudy

openml/tasks/task.py

Lines changed: 22 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -1,15 +1,16 @@
11
# License: BSD 3-Clause
22
from __future__ import annotations
33

4+
import logging
45
import warnings
56
from abc import ABC
67
from collections.abc import Sequence
78
from enum import Enum
8-
from pathlib import Path
99
from typing import TYPE_CHECKING, Any, ClassVar
1010
from typing_extensions import TypedDict
1111

12-
import openml
12+
import arff
13+
1314
import openml._api_calls
1415
from openml import datasets
1516
from openml.base import OpenMLBase
@@ -22,6 +23,9 @@
2223
import pandas as pd
2324

2425

26+
logger = logging.getLogger(__name__)
27+
28+
2529
# TODO(eddiebergman): Should use `auto()` but might be too late if these numbers are used
2630
# and stored on server.
2731
class TaskType(Enum):
@@ -178,18 +182,6 @@ def get_train_test_split_indices(
178182

179183
return self.split.get(repeat=repeat, fold=fold, sample=sample)
180184

181-
def _download_split(self, cache_file: Path) -> None:
182-
# TODO(eddiebergman): Not sure about this try to read and error approach
183-
try:
184-
with cache_file.open(encoding="utf8"):
185-
pass
186-
except OSError:
187-
split_url = self.estimation_procedure["data_splits_url"]
188-
openml._api_calls._download_text_file(
189-
source=str(split_url),
190-
output_path=str(cache_file),
191-
)
192-
193185
def download_split(self) -> OpenMLSplit:
194186
"""Download the OpenML split for a given task."""
195187
# TODO(eddiebergman): Can this ever be `None`?
@@ -199,9 +191,23 @@ def download_split(self) -> OpenMLSplit:
199191

200192
try:
201193
split = OpenMLSplit._from_arff_file(cached_split_file)
202-
except OSError:
194+
logger.debug("Loaded file from cache: %s", str(cached_split_file))
195+
except (OSError, arff.BadDataFormat):
196+
logger.info("Failed to load file from cache: %s", str(cached_split_file))
197+
if cached_split_file.exists():
198+
logger.debug("Cleaning up old file")
199+
cached_split_file.unlink()
203200
# Next, download and cache the associated split file
204-
self._download_split(cached_split_file)
201+
split_url = self.estimation_procedure["data_splits_url"]
202+
openml._api_calls._download_text_file(
203+
source=str(split_url),
204+
output_path=str(cached_split_file),
205+
)
206+
if cached_split_file.exists():
207+
logger.info("New file created of size %d", cached_split_file.stat().st_size)
208+
else:
209+
logger.info("Failed to create new file")
210+
205211
split = OpenMLSplit._from_arff_file(cached_split_file)
206212

207213
return split

pyproject.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -126,6 +126,7 @@ version = {attr = "openml.__version__.__version__"}
126126

127127
# https://docs.pytest.org/en/7.2.x/reference/reference.html#ini-options-ref
128128
[tool.pytest.ini_options]
129+
log_level="DEBUG"
129130
testpaths = ["tests"]
130131
minversion = "7.0"
131132
xfail_strict = true

tests/conftest.py

Lines changed: 17 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -289,22 +289,35 @@ def as_robot() -> Iterator[None]:
289289
@pytest.fixture(autouse=True)
290290
def with_server(request):
291291
openml.config.set_api_version(APIVersion.V1)
292+
292293
if "production_server" in request.keywords:
294+
# use-production-server (remote)
293295
openml.config.set_servers("production")
294-
yield
295-
return
296-
openml.config.set_servers("test")
296+
elif os.getenv("OPENML_USE_LOCAL_SERVICES") == "true":
297+
# use-test-server (local)
298+
openml.config.set_servers("local")
299+
else:
300+
# use-test-server (remote)
301+
openml.config.set_servers("test")
302+
297303
yield
298304

299305

300306
@pytest.fixture(autouse=True)
301307
def with_test_cache(test_files_directory, request):
308+
# Skip this fixture for TestBase subclasses - they manage their own cache directory
309+
# in setUp()/tearDown(). Having both mechanisms fight over the global config
310+
# causes race conditions.
311+
if request.instance is not None and isinstance(request.instance, TestBase):
312+
yield
313+
return
314+
302315
if not test_files_directory.exists():
303316
raise ValueError(
304317
f"Cannot find test cache dir, expected it to be {test_files_directory!s}!",
305318
)
306319
_root_cache_directory = openml.config._root_cache_directory
307-
tmp_cache = test_files_directory / request.node.name
320+
tmp_cache = test_files_directory / request.node.nodeid.replace("/", ".").replace("::", ".")
308321
openml.config.set_root_cache_directory(tmp_cache)
309322
yield
310323
openml.config.set_root_cache_directory(_root_cache_directory)

tests/test_datasets/test_dataset_functions.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2005,4 +2005,4 @@ def test_get_dataset_parquet(requests_mock, test_files_directory, test_server_v1
20052005
assert dataset._parquet_url is not None
20062006
assert dataset.parquet_file is not None
20072007
assert os.path.isfile(dataset.parquet_file)
2008-
assert dataset.data_file is None # is alias for arff path
2008+
assert dataset.data_file is None # is alias for arff path

tests/test_flows/test_flow.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
import copy
66
import hashlib
77
import re
8+
import os
89
import time
910
from packaging.version import Version
1011
from unittest import mock
@@ -33,7 +34,6 @@
3334
from openml.testing import SimpleImputer, TestBase
3435

3536

36-
3737
class TestFlow(TestBase):
3838
_multiprocess_can_split_ = True
3939

0 commit comments

Comments
 (0)