Skip to content

Commit b6b528b

Browse files
authored
Remove diabetes dataset references and switch tests to synthetic regression data (#1361)
1 parent 36712d6 commit b6b528b

File tree

4 files changed

+39
-39
lines changed

4 files changed

+39
-39
lines changed

ads/dataset/dataset_browser.py

Lines changed: 11 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -5,26 +5,27 @@
55
# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/
66

77

8-
from __future__ import print_function, absolute_import
8+
from __future__ import absolute_import, print_function
99

10-
import re, pathlib, os
10+
import os
11+
import pathlib
12+
import re
1113
import urllib.parse
1214
from abc import ABC, abstractmethod
1315
from os import listdir
14-
from os.path import isfile, isdir, join, getsize
15-
from typing import List, Set, Tuple, Dict
16-
17-
import requests
16+
from os.path import getsize, isdir, isfile, join
17+
from typing import Dict, List, Set, Tuple
1818

1919
import pandas as pd
20+
import requests
2021
import sklearn.datasets as sk_datasets
2122

22-
from ads.dataset import helper
23-
from ads.common.utils import inject_and_copy_kwargs
2423
from ads.common.decorator.runtime_dependency import (
25-
runtime_dependency,
2624
OptionalDependency,
25+
runtime_dependency,
2726
)
27+
from ads.common.utils import inject_and_copy_kwargs
28+
from ads.dataset import helper
2829

2930

3031
class DatasetBrowser(ABC):
@@ -318,7 +319,7 @@ def open(self, name: str, **kwargs):
318319

319320
class SklearnDatasets(DatasetBrowser):
320321

321-
sklearn_datasets = ["breast_cancer", "diabetes", "iris", "wine", "digits"]
322+
sklearn_datasets = ["breast_cancer", "iris", "wine", "digits"]
322323

323324
def __init__(self):
324325
super(DatasetBrowser, self).__init__()

docs/source/user_guide/loading_data/connect_legacy.rst

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -327,7 +327,7 @@ To see which dataset is available from scikit-learn, use:
327327
328328
.. parsed-literal::
329329
330-
['boston', 'breast_cancer', 'diabetes', 'iris', 'wine', 'digits']
330+
['boston', 'breast_cancer', 'iris', 'wine', 'digits']
331331
332332
Datasets are provided as a convenience. Datasets are considered Third Party Content and are not considered Materials under Your agreement with Oracle applicable to the Services. Review the `dataset license <https://github.com/scikit-learn/scikit-learn/blob/master/COPYING>`__.
333333

@@ -336,4 +336,3 @@ To explore one of the datasets, use ``open()`` specifying the name of the datase
336336
.. code-block:: python3
337337
338338
ds = sklearn.open('wine')
339-

tests/integration/opctl/opctl_tests_files/linear_reg_test/main.py

Lines changed: 11 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1,21 +1,23 @@
11
#!/usr/bin/env python
22

3-
# Copyright (c) 2023 Oracle and/or its affiliates.
3+
# Copyright (c) 2023, 2025 Oracle and/or its affiliates.
44
# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/
55

66
import argparse
7-
import numpy as np
8-
from sklearn import datasets, linear_model
7+
8+
from sklearn import linear_model
9+
from sklearn.datasets import make_regression
910
from sklearn.metrics import mean_squared_error, r2_score
1011
from sklearn.model_selection import train_test_split
1112

1213

1314
def main(test_size):
14-
diabetes_X, diabetes_y = datasets.load_diabetes(return_X_y=True)
15-
diabetes_X = diabetes_X[:, np.newaxis, 2]
15+
X, y = make_regression(
16+
n_samples=442, n_features=1, n_informative=1, noise=10.0, random_state=42
17+
)
1618

1719
X_train, X_test, y_train, y_test = train_test_split(
18-
diabetes_X, diabetes_y, test_size=test_size
20+
X, y, test_size=test_size
1921
)
2022

2123
# Create linear regression object
@@ -25,14 +27,14 @@ def main(test_size):
2527
regr.fit(X_train, y_train)
2628

2729
# Make predictions using the testing set
28-
diabetes_y_pred = regr.predict(X_test)
30+
y_pred = regr.predict(X_test)
2931

3032
# The coefficients
3133
print("Coefficients: \n", regr.coef_)
3234
# The mean squared error
33-
print("Mean squared error: %.2f" % mean_squared_error(y_test, diabetes_y_pred))
35+
print("Mean squared error: %.2f" % mean_squared_error(y_test, y_pred))
3436
# The coefficient of determination: 1 is perfect prediction
35-
print("Coefficient of determination: %.2f" % r2_score(y_test, diabetes_y_pred))
37+
print("Coefficient of determination: %.2f" % r2_score(y_test, y_pred))
3638

3739

3840
if __name__ == "__main__":

tests/unitary/with_extras/model/test_model_metadata_mixin.py

Lines changed: 16 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -1,17 +1,17 @@
11
#!/usr/bin/env python
22

3-
# Copyright (c) 2022, 2023 Oracle and/or its affiliates.
3+
# Copyright (c) 2022, 2025 Oracle and/or its affiliates.
44
# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/
55

66
import os
77
import shutil
88
from unittest.mock import patch
99

10-
import numpy as np
1110
import pytest
1211
import sklearn
1312
import xgboost
14-
from sklearn import datasets, linear_model
13+
from sklearn import linear_model
14+
from sklearn.datasets import make_regression
1515

1616
from ads.feature_engineering.schema import Schema
1717
from ads.model.framework.sklearn_model import SklearnModel
@@ -22,19 +22,17 @@
2222

2323
class TestMetadataMixin:
2424
def setup_method(cls):
25-
# Load the diabetes dataset
26-
diabetes_X, diabetes_y = datasets.load_diabetes(return_X_y=True)
27-
28-
# Use only one feature
29-
diabetes_X = diabetes_X[:, np.newaxis, 2]
25+
X, y = make_regression(
26+
n_samples=442, n_features=1, n_informative=1, noise=10.0, random_state=42
27+
)
3028

3129
# Split the data into training/testing sets
32-
cls.diabetes_X_train = diabetes_X[:-20]
33-
cls.diabetes_X_test = diabetes_X[-20:]
30+
cls.X_train = X[:-20]
31+
cls.X_test = X[-20:]
3432

3533
# Split the targets into training/testing sets
36-
cls.diabetes_y_train = diabetes_y[:-20]
37-
cls.diabetes_y_test = diabetes_y[-20:]
34+
cls.y_train = y[:-20]
35+
cls.y_test = y[-20:]
3836

3937
# Create linear regression object
4038
regr = linear_model.LinearRegression()
@@ -43,8 +41,8 @@ def setup_method(cls):
4341

4442
xgb_regr = XGBRegressor()
4543
# Train the model using the training sets
46-
cls.rgr = regr.fit(cls.diabetes_X_train, cls.diabetes_y_train)
47-
cls.xgb_rgr = xgb_regr.fit(cls.diabetes_X_train, cls.diabetes_y_train)
44+
cls.rgr = regr.fit(cls.X_train, cls.y_train)
45+
cls.xgb_rgr = xgb_regr.fit(cls.X_train, cls.y_train)
4846

4947
def test_metadata_generic_model(self):
5048
model = GenericModel(self.rgr, artifact_dir="~/test_generic")
@@ -132,8 +130,8 @@ def test_metadata_sklearn_model(self, mock_get_service_packs):
132130
)
133131
model.populate_metadata(
134132
use_case_type="other",
135-
X_sample=self.diabetes_X_test,
136-
y_sample=self.diabetes_y_test,
133+
X_sample=self.X_test,
134+
y_sample=self.y_test,
137135
)
138136

139137
assert model.metadata_custom.get("ModelSerializationFormat").value == "joblib"
@@ -185,8 +183,8 @@ def test_metadata_xgboost_model(self, mock_get_service_packs):
185183
)
186184
model.populate_metadata(
187185
use_case_type="binary_classification",
188-
X_sample=self.diabetes_X_test,
189-
y_sample=self.diabetes_y_test,
186+
X_sample=self.X_test,
187+
y_sample=self.y_test,
190188
)
191189
assert (
192190
model.metadata_custom.get("CondaEnvironment").value

0 commit comments

Comments
 (0)