Skip to content

Commit ff87c85

Browse files
tomerbv authored and Eyal-Danieli committed
Update scikit-learn to version 1.5 (mlrun#967)
* updated scikit-learn~=1.5
  fixes and patches for new scikit-learn version
  changes in item.yaml and regenerate function.yaml
* remove filename
* remove numpy import
* revert sklearn.metrics monkey patch
  fix _get_dataframe to handle list/dict before accessing artifact_url
  added feature name preservation logic in predict function
* revert mlrun version
* revert get_or_create_project
* revert scikit-learn version
* scikit-learn==1.5.2 mlrun v 1.10
* scikit-learn==1.4.2
* revert scikit-learn<1.4.0
* scikit-learn~=1.5
* mlrun 1.10 with scikit-learn<1.4.0
* scikit-learn strict v~=1.5.2
  added skip for test_train in test_auto_trainer.py
* revert sklearn_classifier.py changes
  change XGBRegressor to LGBMRegressor
* added xgboost.XGBRegressor, xgboost.XGBClassifier and lightgbm.LGBMClassifier models to test
1 parent f4bd787 commit ff87c85

File tree

12 files changed

+148
-132
lines changed

12 files changed

+148
-132
lines changed

functions/src/auto_trainer/auto_trainer.py

Lines changed: 36 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -67,30 +67,14 @@ def _get_dataframe(
6767
Classification tasks.
6868
:param drop_columns: str/int or a list of strings/ints that represent the column names/indices to drop.
6969
"""
70-
store_uri_prefix, _ = mlrun.datastore.parse_store_uri(dataset.artifact_url)
71-
72-
# Getting the dataset:
73-
if mlrun.utils.StorePrefix.FeatureVector == store_uri_prefix:
74-
label_columns = label_columns or dataset.meta.status.label_column
75-
context.logger.info(f"label columns: {label_columns}")
76-
# FeatureVector case:
77-
try:
78-
fv = mlrun.datastore.get_store_resource(dataset.artifact_url)
79-
dataset = fv.get_offline_features(drop_columns=drop_columns).to_dataframe()
80-
except AttributeError:
81-
# Leave here for backwards compatibility
82-
dataset = fs.get_offline_features(
83-
dataset.meta.uri, drop_columns=drop_columns
84-
).to_dataframe()
85-
86-
elif not label_columns:
87-
context.logger.info(
88-
"label_columns not provided, mandatory when dataset is not a FeatureVector"
89-
)
90-
raise ValueError
91-
92-
elif isinstance(dataset, (list, dict)):
70+
# Check if dataset is list/dict first (before trying to access artifact_url)
71+
if isinstance(dataset, (list, dict)):
9372
# list/dict case:
73+
if not label_columns:
74+
context.logger.info(
75+
"label_columns not provided, mandatory when dataset is not a FeatureVector"
76+
)
77+
raise ValueError
9478
dataset = pd.DataFrame(dataset)
9579
# Checking if drop_columns provided by integer type:
9680
if drop_columns:
@@ -103,17 +87,38 @@ def _get_dataframe(
10387
)
10488
raise ValueError
10589
dataset.drop(drop_columns, axis=1, inplace=True)
106-
10790
else:
108-
# simple URL case:
109-
dataset = dataset.as_df()
110-
if drop_columns:
111-
if all(col in dataset for col in drop_columns):
112-
dataset = dataset.drop(drop_columns, axis=1)
113-
else:
91+
# Dataset is a DataItem with artifact_url (URI or FeatureVector)
92+
store_uri_prefix, _ = mlrun.datastore.parse_store_uri(dataset.artifact_url)
93+
94+
# Getting the dataset:
95+
if mlrun.utils.StorePrefix.FeatureVector == store_uri_prefix:
96+
label_columns = label_columns or dataset.meta.status.label_column
97+
context.logger.info(f"label columns: {label_columns}")
98+
# FeatureVector case:
99+
try:
100+
fv = mlrun.datastore.get_store_resource(dataset.artifact_url)
101+
dataset = fv.get_offline_features(drop_columns=drop_columns).to_dataframe()
102+
except AttributeError:
103+
# Leave here for backwards compatibility
104+
dataset = fs.get_offline_features(
105+
dataset.meta.uri, drop_columns=drop_columns
106+
).to_dataframe()
107+
else:
108+
# simple URL case:
109+
if not label_columns:
114110
context.logger.info(
115-
"not all of the columns to drop in the dataset, drop columns process skipped"
111+
"label_columns not provided, mandatory when dataset is not a FeatureVector"
116112
)
113+
raise ValueError
114+
dataset = dataset.as_df()
115+
if drop_columns:
116+
if all(col in dataset for col in drop_columns):
117+
dataset = dataset.drop(drop_columns, axis=1)
118+
else:
119+
context.logger.info(
120+
"not all of the columns to drop in the dataset, drop columns process skipped"
121+
)
117122

118123
return dataset, label_columns
119124

functions/src/auto_trainer/function.yaml

Lines changed: 38 additions & 38 deletions
Large diffs are not rendered by default.

functions/src/auto_trainer/item.yaml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@ labels:
1313
author: Iguazio
1414
maintainers: []
1515
marketplaceType: ''
16-
mlrunVersion: 1.7.0
16+
mlrunVersion: 1.10.0
1717
name: auto_trainer
1818
platformVersion: 3.5.0
1919
spec:
@@ -23,4 +23,4 @@ spec:
2323
kind: job
2424
requirements: []
2525
url: ''
26-
version: 1.8.0
26+
version: 1.9.0
functions/src/auto_trainer/requirements.txt

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
pandas
2-
scikit-learn<1.4.0
2+
scikit-learn~=1.5.2
3+
lightgbm
34
xgboost<2.0.0
45
plotly

functions/src/auto_trainer/test_auto_trainer.py

Lines changed: 10 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,9 @@
2929
("sklearn.linear_model.LinearRegression", "regression"),
3030
("sklearn.ensemble.RandomForestClassifier", "classification"),
3131
("xgboost.XGBRegressor", "regression"),
32+
("xgboost.XGBClassifier", "classification"),
33+
("lightgbm.LGBMRegressor", "regression"),
34+
("lightgbm.LGBMClassifier", "classification")
3235
]
3336

3437
REQUIRED_ENV_VARS = [
@@ -78,11 +81,15 @@ def _assert_train_handler(train_run):
7881

7982

8083
@pytest.mark.parametrize("model", MODELS)
84+
@pytest.mark.skipif(
85+
condition=not _validate_environment_variables(),
86+
reason="Project's environment variables are not set",
87+
)
8188
def test_train(model: Tuple[str, str]):
8289
dataset, label_columns = _get_dataset(model[1])
8390
is_test_passed = True
8491

85-
project = mlrun.new_project("auto-trainer-test", context="./")
92+
project = mlrun.get_or_create_project("auto-trainer-test", context="./")
8693
fn = project.set_function("function.yaml", "train", kind="job", image="mlrun/mlrun")
8794

8895
train_run = None
@@ -119,7 +126,7 @@ def test_train_evaluate(model: Tuple[str, str]):
119126
dataset, label_columns = _get_dataset(model[1])
120127
is_test_passed = True
121128
# Importing function:
122-
project = mlrun.new_project("auto-trainer-test", context="./")
129+
project = mlrun.get_or_create_project("auto-trainer-test", context="./")
123130
fn = project.set_function("function.yaml", "train", kind="job", image="mlrun/mlrun")
124131
temp_dir = tempfile.mkdtemp()
125132

@@ -172,7 +179,7 @@ def test_train_predict(model: Tuple[str, str]):
172179
df = pd.read_csv(dataset)
173180
sample = df.head().drop("labels", axis=1).values.tolist()
174181
# Importing function:
175-
project = mlrun.new_project("auto-trainer-test", context="./")
182+
project = mlrun.get_or_create_project("auto-trainer-test", context="./")
176183
fn = project.set_function("function.yaml", "train", kind="job", image="mlrun/mlrun")
177184
temp_dir = tempfile.mkdtemp()
178185

functions/src/describe/function.yaml

Lines changed: 41 additions & 41 deletions
Large diffs are not rendered by default.

functions/src/describe/item.yaml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@ labels:
1111
author: Iguazio
1212
maintainers: []
1313
marketplaceType: ''
14-
mlrunVersion: 1.7.0
14+
mlrunVersion: 1.10.0
1515
name: describe
1616
platformVersion: 3.5.3
1717
spec:
@@ -21,4 +21,4 @@ spec:
2121
kind: job
2222
requirements: []
2323
url: ''
24-
version: 1.4.0
24+
version: 1.5.0

functions/src/describe/requirements.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
scikit-learn~=1.0.2
1+
scikit-learn~=1.5.2
22
plotly~=5.23
33
pytest~=7.0.1
44
matplotlib~=3.5.1

functions/src/gen_class_data/function.yaml

Lines changed: 9 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1,13 +1,15 @@
11
metadata:
2-
categories:
3-
- data-generation
42
tag: ''
53
name: gen-class-data
4+
categories:
5+
- data-generation
6+
verbose: false
67
spec:
78
description: Create a binary classification sample dataset and save.
8-
default_handler: gen_class_data
99
entry_points:
1010
gen_class_data:
11+
lineno: 22
12+
has_varargs: false
1113
has_kwargs: false
1214
parameters:
1315
- name: context
@@ -48,7 +50,6 @@ spec:
4850
- name: sk_params
4951
doc: additional parameters for `sklearn.datasets.make_classification`
5052
default: {}
51-
lineno: 22
5253
doc: 'Create a binary classification sample dataset and save.
5354
5455
If no filename is given it will default to:
@@ -59,14 +60,13 @@ spec:
5960
Additional scikit-learn parameters can be set using **sk_params, please see
6061
https://scikit-learn.org/stable/modules/generated/sklearn.datasets.make_classification.html
6162
for more details.'
62-
has_varargs: false
6363
name: gen_class_data
64-
command: ''
65-
disable_auto_mount: false
66-
image: mlrun/mlrun
6764
build:
6865
origin_filename: ''
6966
functionSourceCode: IyBDb3B5cmlnaHQgMjAxOSBJZ3VhemlvCiMKIyBMaWNlbnNlZCB1bmRlciB0aGUgQXBhY2hlIExpY2Vuc2UsIFZlcnNpb24gMi4wICh0aGUgIkxpY2Vuc2UiKTsKIyB5b3UgbWF5IG5vdCB1c2UgdGhpcyBmaWxlIGV4Y2VwdCBpbiBjb21wbGlhbmNlIHdpdGggdGhlIExpY2Vuc2UuCiMgWW91IG1heSBvYnRhaW4gYSBjb3B5IG9mIHRoZSBMaWNlbnNlIGF0CiMKIyAgICAgaHR0cDovL3d3dy5hcGFjaGUub3JnL2xpY2Vuc2VzL0xJQ0VOU0UtMi4wCiMKIyBVbmxlc3MgcmVxdWlyZWQgYnkgYXBwbGljYWJsZSBsYXcgb3IgYWdyZWVkIHRvIGluIHdyaXRpbmcsIHNvZnR3YXJlCiMgZGlzdHJpYnV0ZWQgdW5kZXIgdGhlIExpY2Vuc2UgaXMgZGlzdHJpYnV0ZWQgb24gYW4gIkFTIElTIiBCQVNJUywKIyBXSVRIT1VUIFdBUlJBTlRJRVMgT1IgQ09ORElUSU9OUyBPRiBBTlkgS0lORCwgZWl0aGVyIGV4cHJlc3Mgb3IgaW1wbGllZC4KIyBTZWUgdGhlIExpY2Vuc2UgZm9yIHRoZSBzcGVjaWZpYyBsYW5ndWFnZSBnb3Zlcm5pbmcgcGVybWlzc2lvbnMgYW5kCiMgbGltaXRhdGlvbnMgdW5kZXIgdGhlIExpY2Vuc2UuCiMKaW1wb3J0IHBhbmRhcyBhcyBwZApmcm9tIHR5cGluZyBpbXBvcnQgT3B0aW9uYWwsIExpc3QKZnJvbSBza2xlYXJuLmRhdGFzZXRzIGltcG9ydCBtYWtlX2NsYXNzaWZpY2F0aW9uCgpmcm9tIG1scnVuLmV4ZWN1dGlvbiBpbXBvcnQgTUxDbGllbnRDdHgKCgpkZWYgZ2VuX2NsYXNzX2RhdGEoCiAgICAgICAgY29udGV4dDogTUxDbGllbnRDdHgsCiAgICAgICAgbl9zYW1wbGVzOiBpbnQsCiAgICAgICAgbV9mZWF0dXJlczogaW50LAogICAgICAgIGtfY2xhc3NlczogaW50LAogICAgICAgIGhlYWRlcjogT3B0aW9uYWxbTGlzdFtzdHJdXSwKICAgICAgICBsYWJlbF9jb2x1bW46IE9wdGlvbmFsW3N0cl0gPSAibGFiZWxzIiwKICAgICAgICB3ZWlnaHQ6IGZsb2F0ID0gMC41LAogICAgICAgIHJhbmRvbV9zdGF0ZTogaW50ID0gMSwKICAgICAgICBrZXk6IHN0ciA9ICJjbGFzc2lmaWVyLWRhdGEiLAogICAgICAgIGZpbGVfZXh0OiBzdHIgPSAicGFycXVldCIsCiAgICAgICAgc2tfcGFyYW1zPXt9Cik6CiAgICAiIiJDcmVhdGUgYSBiaW5hcnkgY2xhc3NpZmljYXRpb24gc2FtcGxlIGRhdGFzZXQgYW5kIHNhdmUuCiAgICBJZiBubyBmaWxlbmFtZSBpcyBnaXZlbiBpdCB3aWxsIGRlZmF1bHQgdG86CiAgICAic2ltZGF0YS17bl9zYW1wbGVzfVh7bV9mZWF0dXJlc30ucGFycXVldCIuCgogICAgQWRkaXRpb25hbCBzY2lraXQtbGVhcm4gcGFyYW1ldGVycyBjYW4gYmUgc2V0IHVzaW5nICoqc2tfcGFyYW1zLCBwbGVhc2Ugc2VlIGh0dHBzOi8vc2Npa2l0LWxlYXJuLm9yZy9zdGFibGUvbW9kdWxlcy9nZW5lcmF0ZWQvc2tsZWFybi5kYXRhc2V0cy5tYWtlX2NsYXNzaWZpY2F0aW9uLmh0bWwgZm9yIG1vcmUgZGV0YWlscy4KCiAgICA6cGFyYW0gY29udGV4dDogICAgICAgZnVuY3Rpb24gY29udGV4dAogICAgOnBh
cmFtIG5fc2FtcGxlczogICAgIG51bWJlciBvZiByb3dzL3NhbXBsZXMKICAgIDpwYXJhbSBtX2ZlYXR1cmVzOiAgICBudW1iZXIgb2YgY29scy9mZWF0dXJlcwogICAgOnBhcmFtIGtfY2xhc3NlczogICAgIG51bWJlciBvZiBjbGFzc2VzCiAgICA6cGFyYW0gaGVhZGVyOiAgICAgICAgaGVhZGVyIGZvciBmZWF0dXJlcyBhcnJheQogICAgOnBhcmFtIGxhYmVsX2NvbHVtbjogIGNvbHVtbiBuYW1lIG9mIGdyb3VuZC10cnV0aCBzZXJpZXMKICAgIDpwYXJhbSB3ZWlnaHQ6ICAgICAgICBmcmFjdGlvbiBvZiBzYW1wbGUgbmVnYXRpdmUgdmFsdWUgKGdyb3VuZC10cnV0aD0wKQogICAgOnBhcmFtIHJhbmRvbV9zdGF0ZTogIHJuZyBzZWVkIChzZWUgaHR0cHM6Ly9zY2lraXQtbGVhcm4ub3JnL3N0YWJsZS9nbG9zc2FyeS5odG1sI3Rlcm0tcmFuZG9tLXN0YXRlKQogICAgOnBhcmFtIGtleTogICAgICAgICAgIGtleSBvZiBkYXRhIGluIGFydGlmYWN0IHN0b3JlCiAgICA6cGFyYW0gZmlsZV9leHQ6ICAgICAgKHBxdCkgZXh0ZW5zaW9uIGZvciBwYXJxdWV0IGZpbGUKICAgIDpwYXJhbSBza19wYXJhbXM6ICAgICBhZGRpdGlvbmFsIHBhcmFtZXRlcnMgZm9yIGBza2xlYXJuLmRhdGFzZXRzLm1ha2VfY2xhc3NpZmljYXRpb25gCiAgICAiIiIKICAgIGZlYXR1cmVzLCBsYWJlbHMgPSBtYWtlX2NsYXNzaWZpY2F0aW9uKAogICAgICAgIG5fc2FtcGxlcz1uX3NhbXBsZXMsCiAgICAgICAgbl9mZWF0dXJlcz1tX2ZlYXR1cmVzLAogICAgICAgIHdlaWdodHM9d2VpZ2h0LAogICAgICAgIG5fY2xhc3Nlcz1rX2NsYXNzZXMsCiAgICAgICAgcmFuZG9tX3N0YXRlPXJhbmRvbV9zdGF0ZSwKICAgICAgICAqKnNrX3BhcmFtcykKCiAgICAjIG1ha2UgZGF0YWZyYW1lcywgYWRkIGNvbHVtbiBuYW1lcywgY29uY2F0ZW5hdGUgKFgsIHkpCiAgICBYID0gcGQuRGF0YUZyYW1lKGZlYXR1cmVzKQogICAgaWYgbm90IGhlYWRlcjoKICAgICAgICBYLmNvbHVtbnMgPSBbImZlYXRfIiArIHN0cih4KSBmb3IgeCBpbiByYW5nZShtX2ZlYXR1cmVzKV0KICAgIGVsc2U6CiAgICAgICAgWC5jb2x1bW5zID0gaGVhZGVyCgogICAgeSA9IHBkLkRhdGFGcmFtZShsYWJlbHMsIGNvbHVtbnM9W2xhYmVsX2NvbHVtbl0pCiAgICBkYXRhID0gcGQuY29uY2F0KFtYLCB5XSwgYXhpcz0xKQoKICAgIGNvbnRleHQubG9nX2RhdGFzZXQoa2V5LCBkZj1kYXRhLCBmb3JtYXQ9ZmlsZV9leHQsIGluZGV4PUZhbHNlKQo=
7067
code_origin: ''
68+
command: ''
69+
image: mlrun/mlrun
70+
default_handler: gen_class_data
71+
disable_auto_mount: false
7172
kind: job
72-
verbose: false

functions/src/gen_class_data/item.yaml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@ labels:
1111
author: Iguazio
1212
maintainers: []
1313
marketplaceType: ''
14-
mlrunVersion: 1.7.0
14+
mlrunVersion: 1.10.0
1515
name: gen_class_data
1616
platformVersion: 3.5.3
1717
spec:
@@ -21,4 +21,4 @@ spec:
2121
kind: job
2222
requirements: []
2323
url: ''
24-
version: 1.3.0
24+
version: 1.4.0

0 commit comments

Comments
 (0)