Commit 1ca8e5e
[Describe] Align describe to new pandas version (#812)
* [Describe] Align describe to new pandas version
* minor test fix
* update mlrun version
* add dask to requirements
* remove dask
* update numpy version
* debug
* debug
* debug
* remove dask tests
* remove debug code
1 parent d692c1a commit 1ca8e5e

File tree: 5 files changed, +65 −151 lines


describe/describe.py

Lines changed: 20 additions & 19 deletions
```diff
@@ -36,7 +36,7 @@
 )
 from mlrun.datastore import DataItem
 from mlrun.execution import MLClientCtx
-from mlrun.feature_store import FeatureSet, FeatureVector
+from mlrun.feature_store import FeatureSet
 from plotly.subplots import make_subplots
 
 pd.set_option("display.float_format", lambda x: "%.2f" % x)
```

```diff
@@ -234,24 +234,24 @@ def _create_features_histogram_artifacts(
     if label_column is not None and problem_type == "classification":
         all_labels = df[label_column].unique()
     visible = True
-    for (columnName, _) in df.iteritems():
-        if columnName == label_column:
+    for column_name in df.columns:
+        if column_name == label_column:
             continue
 
         if label_column is not None and problem_type == "classification":
             for label in all_labels:
                 sub_fig = go.Histogram(
                     histfunc="count",
-                    x=df.loc[df[label_column] == label][columnName],
+                    x=df.loc[df[label_column] == label][column_name],
                     name=str(label),
                     visible=visible,
                 )
-                figs[f"{columnName}@?@{label}"] = sub_fig
+                figs[f"{column_name}@?@{label}"] = sub_fig
         else:
-            sub_fig = go.Histogram(histfunc="count", x=df[columnName], visible=visible)
-            figs[f"{columnName}@?@{1}"] = sub_fig
+            sub_fig = go.Histogram(histfunc="count", x=df[column_name], visible=visible)
+            figs[f"{column_name}@?@{1}"] = sub_fig
         if visible:
-            first_feature_name = columnName
+            first_feature_name = column_name
             visible = False
 
     fig = go.Figure()
```

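For context, `DataFrame.iteritems()` was deprecated in pandas 1.5 and removed in pandas 2.0, which is why the loop above now iterates over `df.columns`. A minimal sketch of the replacement pattern when only column names are needed; the frame below is illustrative, not the describe job's real input:

```python
import pandas as pd

# Illustrative frame; the real data arrives through the describe job's DataItem input.
df = pd.DataFrame({"age": [25, 32, 47], "income": [40_000, 52_000, 61_000]})

# pandas < 2.0 (removed in 2.0):
#   for column_name, _ in df.iteritems(): ...

# Works on both pandas 1.x and 2.x when only the column name is needed:
for column_name in df.columns:
    print(column_name, df[column_name].dtype)
```
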
```diff
@@ -338,7 +338,7 @@ def _create_features_2d_scatter_artifacts(
     Create and log a scatter-2d artifact for each couple of features
     """
     features = [
-        columnName for (columnName, _) in df.iteritems() if columnName != label_column
+        column_name for column_name in df.columns if column_name != label_column
     ]
     max_feature_len = float(max(len(elem) for elem in features))
     if label_column is not None:
```

```diff
@@ -450,11 +450,12 @@ def _create_violin_artifact(
 
     plot_num = 0
 
-    for (columnName, columnData) in df.iteritems():
+    for column_name in df.columns:
+        column_data = df[column_name]
         violin = go.Violin(
-            x=[columnName] * columnData.shape[0],
-            y=columnData,
-            name=columnName,
+            x=[column_name] * column_data.shape[0],
+            y=column_data,
+            name=column_name,
         )
 
         fig.add_trace(
```

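Where both the column name and its values are needed, as in the violin artifact above, `DataFrame.items()` is the direct replacement for the removed `iteritems()`; the committed code instead re-selects each column by name, which is equivalent. A small sketch of the two options, using an illustrative frame:

```python
import pandas as pd

df = pd.DataFrame({"age": [25, 32, 47], "income": [40_000, 52_000, 61_000]})

# Option used in this commit: iterate names and re-select each Series.
for column_name in df.columns:
    column_data = df[column_name]
    print(column_name, column_data.mean())

# Equivalent alternative available in both pandas 1.x and 2.x:
for column_name, column_data in df.items():
    print(column_name, column_data.mean())
```
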
```diff
@@ -491,15 +492,15 @@ def _create_imbalance_artifact(
     """
     if label_column:
         if problem_type == "classification":
+            values_column = "count"
             labels_count = df[label_column].value_counts().sort_index()
             df_labels_count = pd.DataFrame(labels_count)
-            df_labels_count.rename(columns={label_column: "Total"}, inplace=True)
             df_labels_count[label_column] = labels_count.index
-            df_labels_count["weights"] = df_labels_count["Total"] / sum(
-                df_labels_count["Total"]
+            df_labels_count.rename(columns={"": values_column}, inplace=True)
+            df_labels_count[values_column] = df_labels_count[values_column] / sum(
+                df_labels_count[values_column]
             )
-
-            fig = px.pie(df_labels_count, names=label_column, values="Total")
+            fig = px.pie(df_labels_count, names=label_column, values=values_column)
         else:
             fig = px.histogram(
                 histfunc="count",
```

```diff
@@ -532,7 +533,7 @@ def _create_corr_artifact(
     """
     if label_column is not None:
         df = df.drop([label_column], axis=1)
-    tblcorr = df.corr()
+    tblcorr = df.corr(numeric_only=True)
     extra_data["correlation-matrix-csv"] = context.log_artifact(
         TableArtifact("correlation-matrix-csv", df=tblcorr, visible=True),
         local_path=f"{plots_dest}/correlation-matrix.csv",
```


describe/function.yaml

Lines changed: 43 additions & 53 deletions
Large diffs are not rendered by default.

describe/item.yaml

Lines changed: 2 additions & 2 deletions
```diff
@@ -11,7 +11,7 @@ labels:
   author: Davids
 maintainers: []
 marketplaceType: ''
-mlrunVersion: 1.4.1
+mlrunVersion: 1.6.0
 name: describe
 platformVersion: 3.5.3
 spec:
@@ -21,4 +21,4 @@ spec:
   kind: job
   requirements: []
   url: ''
-version: 1.2.0
+version: 1.3.0
```


describe/requirements.txt

Lines changed: 0 additions & 1 deletion
```diff
@@ -1,6 +1,5 @@
 scikit-learn~=1.0.2
 plotly~=5.16.1
 pytest~=7.0.1
-pandas~=1.3.5
 matplotlib~=3.5.1
 seaborn~=0.11.2
```


describe/test_describe.py

Lines changed: 0 additions & 76 deletions
```diff
@@ -271,79 +271,3 @@ def _create_data(n_samples, n_features, n_classes, n_informative, reg=False):
     df["timestamp"] = [pd.Timestamp("2022").now()] * n_samples
     df.to_parquet("artifacts/random_dataset.parquet")
     return df
-
-
-def _create_dask_func(uri):
-    dask_cluster_name = "dask-cluster"
-    dask_cluster = new_function(dask_cluster_name, kind="dask", image="mlrun/ml-models")
-    dask_cluster.spec.remote = False
-    dask_uri = uri
-    dask_cluster.export(dask_uri)
-
-
-def test_import_function_describe_dask():
-    dask_uri = "dask_func.yaml"
-    _create_dask_func(dask_uri)
-    describe_func = import_function("function.yaml")
-    is_test_passed = True
-    _create_data(n_samples=100, n_features=5, n_classes=3, n_informative=3)
-    describe_func.spec.command = "describe_dask.py"
-
-    try:
-        describe_run = describe_func.run(
-            name="task-describe",
-            handler="analyze",
-            inputs={"table": DATA_PATH},
-            params={
-                "label_column": "label",
-                "dask_function": dask_uri,
-                "dask_flag": True,
-            },
-            artifact_path=os.path.abspath("./artifacts"),
-            local=True,
-        )
-
-    except Exception as exception:
-        print(f"- The test failed - raised the following error:\n- {exception}")
-        is_test_passed = False
-    _validate_paths(
-        {
-            "imbalance.html",
-            "imbalance-weights-vec.csv",
-        }
-    )
-    assert is_test_passed
-
-
-def test_code_to_function_describe_dask():
-    dask_uri = "dask_func.yaml"
-    _create_dask_func(dask_uri)
-    describe_func = code_to_function(filename="describe.py", kind="local")
-    is_test_passed = True
-    _create_data(n_samples=100, n_features=5, n_classes=3, n_informative=3)
-    describe_func.spec.command = "describe_dask.py"
-
-    try:
-        describe_run = describe_func.run(
-            name="task-describe",
-            handler="analyze",
-            inputs={"table": DATA_PATH},
-            params={
-                "label_column": "label",
-                "dask_function": dask_uri,
-                "dask_flag": True,
-            },
-            artifact_path=os.path.abspath("./artifacts"),
-            local=True,
-        )
-
-    except Exception as exception:
-        print(f"- The test failed - raised the following error:\n- {exception}")
-        is_test_passed = False
-    _validate_paths(
-        {
-            "imbalance.html",
-            "imbalance-weights-vec.csv",
-        }
-    )
-    assert is_test_passed
```

