Commit 1ca8e5e
[Describe] Align describe to new pandas version (#812)
* [Describe] Align describe to new pandas version
* minor test fix
* update mlrun version
* add dask to requirements
* remove dask
* update numpy version
* debug
* debug
* debug
* remove dask tests
* remove debug code
1 parent d692c1a commit 1ca8e5e

File tree: 5 files changed, +65 −151 lines


describe/describe.py

Lines changed: 20 additions & 19 deletions
```diff
@@ -36,7 +36,7 @@
 )
 from mlrun.datastore import DataItem
 from mlrun.execution import MLClientCtx
-from mlrun.feature_store import FeatureSet, FeatureVector
+from mlrun.feature_store import FeatureSet
 from plotly.subplots import make_subplots
 
 pd.set_option("display.float_format", lambda x: "%.2f" % x)
```

```diff
@@ -234,24 +234,24 @@ def _create_features_histogram_artifacts(
     if label_column is not None and problem_type == "classification":
         all_labels = df[label_column].unique()
     visible = True
-    for (columnName, _) in df.iteritems():
-        if columnName == label_column:
+    for column_name in df.columns:
+        if column_name == label_column:
             continue
 
         if label_column is not None and problem_type == "classification":
             for label in all_labels:
                 sub_fig = go.Histogram(
                     histfunc="count",
-                    x=df.loc[df[label_column] == label][columnName],
+                    x=df.loc[df[label_column] == label][column_name],
                     name=str(label),
                     visible=visible,
                 )
-                figs[f"{columnName}@?@{label}"] = sub_fig
+                figs[f"{column_name}@?@{label}"] = sub_fig
         else:
-            sub_fig = go.Histogram(histfunc="count", x=df[columnName], visible=visible)
-            figs[f"{columnName}@?@{1}"] = sub_fig
+            sub_fig = go.Histogram(histfunc="count", x=df[column_name], visible=visible)
+            figs[f"{column_name}@?@{1}"] = sub_fig
         if visible:
-            first_feature_name = columnName
+            first_feature_name = column_name
             visible = False
 
     fig = go.Figure()
```

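For context, `DataFrame.iteritems()` was deprecated in pandas 1.5 and removed in pandas 2.0, which is why the loop above now iterates over `df.columns`. A minimal sketch of the replacement pattern when only column names are needed; the frame below is illustrative, not the describe job's real input:

```python
import pandas as pd

# Illustrative frame; the real data arrives through the describe job's DataItem input.
df = pd.DataFrame({"age": [25, 32, 47], "income": [40_000, 52_000, 61_000]})

# pandas < 2.0 (removed in 2.0):
#   for column_name, _ in df.iteritems(): ...

# Works on both pandas 1.x and 2.x when only the column name is needed:
for column_name in df.columns:
    print(column_name, df[column_name].dtype)
```
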
```diff
@@ -338,7 +338,7 @@ def _create_features_2d_scatter_artifacts(
     Create and log a scatter-2d artifact for each couple of features
     """
     features = [
-        columnName for (columnName, _) in df.iteritems() if columnName != label_column
+        column_name for column_name in df.columns if column_name != label_column
     ]
     max_feature_len = float(max(len(elem) for elem in features))
     if label_column is not None:
```

```diff
@@ -450,11 +450,12 @@ def _create_violin_artifact(
 
     plot_num = 0
 
-    for (columnName, columnData) in df.iteritems():
+    for column_name in df.columns:
+        column_data = df[column_name]
         violin = go.Violin(
-            x=[columnName] * columnData.shape[0],
-            y=columnData,
-            name=columnName,
+            x=[column_name] * column_data.shape[0],
+            y=column_data,
+            name=column_name,
         )
 
         fig.add_trace(
```

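Where both the column name and its values are needed, as in the violin artifact above, `DataFrame.items()` is the direct replacement for the removed `iteritems()`; the committed code instead re-selects each column by name, which is equivalent. A small sketch of the two options, using an illustrative frame:

```python
import pandas as pd

df = pd.DataFrame({"age": [25, 32, 47], "income": [40_000, 52_000, 61_000]})

# Option used in this commit: iterate names and re-select each Series.
for column_name in df.columns:
    column_data = df[column_name]
    print(column_name, column_data.mean())

# Equivalent alternative available in both pandas 1.x and 2.x:
for column_name, column_data in df.items():
    print(column_name, column_data.mean())
```
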
```diff
@@ -491,15 +492,15 @@ def _create_imbalance_artifact(
     """
     if label_column:
         if problem_type == "classification":
+            values_column = "count"
             labels_count = df[label_column].value_counts().sort_index()
             df_labels_count = pd.DataFrame(labels_count)
-            df_labels_count.rename(columns={label_column: "Total"}, inplace=True)
             df_labels_count[label_column] = labels_count.index
-            df_labels_count["weights"] = df_labels_count["Total"] / sum(
-                df_labels_count["Total"]
+            df_labels_count.rename(columns={"": values_column}, inplace=True)
+            df_labels_count[values_column] = df_labels_count[values_column] / sum(
+                df_labels_count[values_column]
             )
-
-            fig = px.pie(df_labels_count, names=label_column, values="Total")
+            fig = px.pie(df_labels_count, names=label_column, values=values_column)
         else:
             fig = px.histogram(
                 histfunc="count",
```

```diff
@@ -532,7 +533,7 @@ def _create_corr_artifact(
     """
     if label_column is not None:
         df = df.drop([label_column], axis=1)
-    tblcorr = df.corr()
+    tblcorr = df.corr(numeric_only=True)
     extra_data["correlation-matrix-csv"] = context.log_artifact(
         TableArtifact("correlation-matrix-csv", df=tblcorr, visible=True),
         local_path=f"{plots_dest}/correlation-matrix.csv",
```


describe/function.yaml

Lines changed: 43 additions & 53 deletions
Large diffs are not rendered by default.

describe/item.yaml

Lines changed: 2 additions & 2 deletions
```diff
@@ -11,7 +11,7 @@ labels:
   author: Davids
 maintainers: []
 marketplaceType: ''
-mlrunVersion: 1.4.1
+mlrunVersion: 1.6.0
 name: describe
 platformVersion: 3.5.3
 spec:
@@ -21,4 +21,4 @@ spec:
   kind: job
   requirements: []
   url: ''
-version: 1.2.0
+version: 1.3.0
```


describe/requirements.txt

Lines changed: 0 additions & 1 deletion
```diff
@@ -1,6 +1,5 @@
 scikit-learn~=1.0.2
 plotly~=5.16.1
 pytest~=7.0.1
-pandas~=1.3.5
 matplotlib~=3.5.1
 seaborn~=0.11.2
```


describe/test_describe.py

Lines changed: 0 additions & 76 deletions
```diff
@@ -271,79 +271,3 @@ def _create_data(n_samples, n_features, n_classes, n_informative, reg=False):
     df["timestamp"] = [pd.Timestamp("2022").now()] * n_samples
     df.to_parquet("artifacts/random_dataset.parquet")
     return df
-
-
-def _create_dask_func(uri):
-    dask_cluster_name = "dask-cluster"
-    dask_cluster = new_function(dask_cluster_name, kind="dask", image="mlrun/ml-models")
-    dask_cluster.spec.remote = False
-    dask_uri = uri
-    dask_cluster.export(dask_uri)
-
-
-def test_import_function_describe_dask():
-    dask_uri = "dask_func.yaml"
-    _create_dask_func(dask_uri)
-    describe_func = import_function("function.yaml")
-    is_test_passed = True
-    _create_data(n_samples=100, n_features=5, n_classes=3, n_informative=3)
-    describe_func.spec.command = "describe_dask.py"
-
-    try:
-        describe_run = describe_func.run(
-            name="task-describe",
-            handler="analyze",
-            inputs={"table": DATA_PATH},
-            params={
-                "label_column": "label",
-                "dask_function": dask_uri,
-                "dask_flag": True,
-            },
-            artifact_path=os.path.abspath("./artifacts"),
-            local=True,
-        )
-
-    except Exception as exception:
-        print(f"- The test failed - raised the following error:\n- {exception}")
-        is_test_passed = False
-    _validate_paths(
-        {
-            "imbalance.html",
-            "imbalance-weights-vec.csv",
-        }
-    )
-    assert is_test_passed
-
-
-def test_code_to_function_describe_dask():
-    dask_uri = "dask_func.yaml"
-    _create_dask_func(dask_uri)
-    describe_func = code_to_function(filename="describe.py", kind="local")
-    is_test_passed = True
-    _create_data(n_samples=100, n_features=5, n_classes=3, n_informative=3)
-    describe_func.spec.command = "describe_dask.py"
-
-    try:
-        describe_run = describe_func.run(
-            name="task-describe",
-            handler="analyze",
-            inputs={"table": DATA_PATH},
-            params={
-                "label_column": "label",
-                "dask_function": dask_uri,
-                "dask_flag": True,
-            },
-            artifact_path=os.path.abspath("./artifacts"),
-            local=True,
-        )
-
-    except Exception as exception:
-        print(f"- The test failed - raised the following error:\n- {exception}")
-        is_test_passed = False
-    _validate_paths(
-        {
-            "imbalance.html",
-            "imbalance-weights-vec.csv",
-        }
-    )
-    assert is_test_passed
```

