
Commit 34a7b3a

fix bug of release 25.02 [skip ci] (#867)
Additional changes for #857. Note: merge this PR with `Create a merge commit to merge`.
2 parents 9fe871f + 43a6edf commit 34a7b3a

5 files changed (+33, -13 lines)

python/src/spark_rapids_ml/classification.py

Lines changed: 1 addition & 1 deletion
@@ -1451,7 +1451,7 @@ def _predict(lr: CumlT, pdf: TransformInputType) -> pd.DataFrame:
 
         data = {}
 
-        scores = lr.decision_function(pdf).T
+        scores = lr.decision_function(pdf)
         assert isinstance(scores, cp.ndarray)
         _num_classes = max(scores.shape[1] if len(scores.shape) == 2 else 2, 2)
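
For context: the downstream code treats axis 1 of `scores` as the class axis, so dropping the `.T` suggests that `decision_function` in cuML 25.02 now returns scores in the scikit-learn orientation of (n_samples, n_classes). A minimal sketch of the shape contract the surrounding code relies on (the array below is a stand-in, not real model output):

```python
import cupy as cp

# Stand-in for lr.decision_function(pdf) under the assumed 25.02 behavior:
# one row per sample, one column per class.
scores = cp.zeros((3, 4))

# Binary models may return a 1-D margin, hence the fallback to 2 classes.
_num_classes = max(scores.shape[1] if len(scores.shape) == 2 else 2, 2)
assert _num_classes == 4
```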

python/src/spark_rapids_ml/clustering.py

Lines changed: 2 additions & 2 deletions
@@ -1,5 +1,5 @@
 #
-# Copyright (c) 2022-2024, NVIDIA CORPORATION.
+# Copyright (c) 2022-2025, NVIDIA CORPORATION.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -115,7 +115,7 @@ def _get_cuml_params_default(self) -> Dict[str, Any]:
             "verbose": False,
             "random_state": 1,
             "init": "scalable-k-means++",
-            "n_init": 1,
+            "n_init": "warn",  # See https://github.com/rapidsai/cuml/pull/6142 - this needs to be updated to "auto" for cuml 25.04
             "oversampling_factor": 2.0,
             "max_samples_per_batch": 32768,
         }
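
For background, the defaults dict above is expected to mirror cuML's own constructor defaults, and cuML 25.02 moved its `n_init` default to the transitional sentinel `"warn"` (see the linked PR). A quick sketch of what that sentinel looks like from the cuML side (assumes cuml 25.02 is installed; the behavior described in the comments is the expected one, not verified here):

```python
from cuml import KMeans

# With no explicit n_init, 25.02 is expected to leave the "warn" sentinel
# in place: cuML emits a FutureWarning and falls back to the legacy
# single-initialization behavior.
km = KMeans(n_clusters=8)
print(km.n_init)  # expected: "warn"; slated to become "auto" in cuml 25.04
```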

python/src/spark_rapids_ml/umap.py

Lines changed: 15 additions & 7 deletions
@@ -1467,7 +1467,9 @@ def saveImpl(self, path: str) -> None:
 
         spark = _get_spark_session()
 
-        def write_sparse_array(array: scipy.sparse.spmatrix, df_dir: str) -> None:
+        def write_sparse_array(
+            array: scipy.sparse.spmatrix, df_dir: str, mode: str
+        ) -> None:
             indptr_schema = StructType([StructField("indptr", IntegerType(), False)])
             indptr_df = spark.createDataFrame(
                 pd.DataFrame(array.indptr), schema=indptr_schema
@@ -1491,10 +1493,12 @@ def write_sparse_array(array: scipy.sparse.spmatrix, df_dir: str) -> None:
                 schema=indices_data_schema,
             )
 
-            indptr_df.write.parquet(os.path.join(df_dir, "indptr.parquet"))
-            indices_data_df.write.parquet(os.path.join(df_dir, "indices_data.parquet"))
+            indptr_df.write.parquet(os.path.join(df_dir, "indptr.parquet"), mode=mode)
+            indices_data_df.write.parquet(
+                os.path.join(df_dir, "indices_data.parquet"), mode=mode
+            )
 
-        def write_dense_array(array: np.ndarray, df_path: str) -> None:
+        def write_dense_array(array: np.ndarray, df_path: str, mode: str) -> None:
             assert (
                 spark.conf.get("spark.sql.execution.arrow.pyspark.enabled") == "true"
             ), "spark.sql.execution.arrow.pyspark.enabled must be set to true to persist array attributes"
@@ -1514,7 +1518,8 @@ def write_dense_array(array: np.ndarray, df_path: str) -> None:
                 ),
                 schema=schema,
             )
-            data_df.write.parquet(df_path)
+
+            data_df.write.parquet(df_path, mode=mode)
 
         DefaultParamsWriter.saveMetadata(
             self.instance,
@@ -1527,6 +1532,9 @@ def write_dense_array(array: np.ndarray, df_path: str) -> None:
             },
         )
 
+        # adhere to the overwrite() -> shouldOverwrite flag from the MLWriter
+        write_mode = "overwrite" if self.shouldOverwrite else "errorifexists"
+
         # get a copy, since we're going to modify the array attributes
         model_attributes = self.instance._get_model_attributes()
         assert model_attributes is not None
@@ -1538,12 +1546,12 @@ def write_dense_array(array: np.ndarray, df_path: str) -> None:
             array = model_attributes[key]
             if isinstance(array, scipy.sparse.csr_matrix):
                 df_dir = os.path.join(data_path, f"{key}csr")
-                write_sparse_array(array, df_dir)
+                write_sparse_array(array, df_dir, write_mode)
                 model_attributes[key] = df_dir
                 model_attributes[key + "shape"] = array.shape
             else:
                 df_path = os.path.join(data_path, f"{key}.parquet")
-                write_dense_array(array, df_path)
+                write_dense_array(array, df_path, write_mode)
                 model_attributes[key] = df_path
 
         metadata_file_path = os.path.join(data_path, "metadata.json")

python/tests/test_kmeans.py

Lines changed: 10 additions & 3 deletions
@@ -302,6 +302,7 @@ def test_kmeans_numeric_type(gpu_number: int, data_type: str) -> None:
     kmeans.fit(df)
 
 
+@pytest.mark.xfail
 @pytest.mark.parametrize("feature_type", pyspark_supported_feature_types)
 @pytest.mark.parametrize("data_shape", [(1000, 20)], ids=idfn)
 @pytest.mark.parametrize("data_type", cuml_supported_data_types)
@@ -322,7 +323,9 @@ def test_kmeans(
 
     n_rows = data_shape[0]
     n_cols = data_shape[1]
-    n_clusters = 8
+    n_clusters = 4
+    tol = 1.0e-20
+    seed = 42  # This does not guarantee deterministic centers in 25.02.
     cluster_std = 1.0
     tolerance = 0.001
 
@@ -333,7 +336,11 @@ def test_kmeans(
     from cuml import KMeans as cuKMeans
 
     cuml_kmeans = cuKMeans(
-        n_clusters=n_clusters, output_type="numpy", tol=1.0e-20, verbose=6
+        n_clusters=n_clusters,
+        output_type="numpy",
+        tol=tol,
+        random_state=seed,
+        verbose=6,
     )
 
     import cudf
@@ -348,7 +355,7 @@ def test_kmeans(
     )
 
     kmeans = KMeans(
-        num_workers=gpu_number, n_clusters=n_clusters, verbose=6
+        num_workers=gpu_number, n_clusters=n_clusters, tol=tol, seed=seed, verbose=6
     ).setFeaturesCol(features_col)
 
     kmeans_model = kmeans.fit(df)
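
Because a fixed seed still does not guarantee identical centers in 25.02 (hence the `xfail` marker), any comparison between the cuML and Spark models' centers has to be order-insensitive at minimum. One tolerant comparison, sketched as a hypothetical helper rather than code from this test:

```python
import numpy as np

def centers_match(a: np.ndarray, b: np.ndarray, atol: float = 1e-3) -> bool:
    # Sort both center lists so cluster-label permutations don't matter,
    # then compare element-wise within the given absolute tolerance.
    a_sorted = np.array(sorted(a.tolist()))
    b_sorted = np.array(sorted(b.tolist()))
    return bool(np.allclose(a_sorted, b_sorted, atol=atol))
```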

python/tests/test_umap.py

Lines changed: 5 additions & 0 deletions
@@ -468,6 +468,11 @@ def test_umap_model_persistence(
     except Exception as e:
         assert re.search(r"Output directory .* already exists", str(e))
 
+    try:
+        umap_model.write().overwrite().save(model_path)
+    except:
+        assert False, "Overwriting should be permitted"
+
     # double check expected files/directories
     model_dir_contents = os.listdir(model_path)
     data_dir_contents = os.listdir(f"{model_path}/data")
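
For reference, the chain the new block exercises is the standard PySpark ML writer protocol; a minimal sketch (names mirror the test, the helper itself is hypothetical):

```python
def save_with_overwrite(model, model_path: str) -> None:
    writer = model.write()   # a pyspark.ml.util.MLWriter subclass
    writer.overwrite()       # sets writer.shouldOverwrite = True
    writer.save(model_path)  # saveImpl() then writes with mode="overwrite"
```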
