
Commit c111341

[auto-merge] branch-25.08 to branch-25.10 [skip ci] [bot] (#971)
auto-merge triggered by github actions on `branch-25.08` to create a PR keeping `branch-25.10` up-to-date. If this PR cannot be merged due to conflicts, it will remain open until manually fixed.
2 parents: 3d223fc + 4663782

7 files changed (+19, -10 lines)

ci/Dockerfile

Lines changed: 2 additions & 2 deletions

@@ -47,6 +47,6 @@ RUN wget --quiet https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86
     && conda config --set solver libmamba
 
 # install cuML
-ARG CUML_VER=25.06
-RUN conda install -y -c rapidsai -c conda-forge -c nvidia cuml=$CUML_VER cuvs=$CUML_VER python=3.10 cuda-version=12.0 numpy~=1.0 \
+ARG CUML_VER=25.08
+RUN conda install -y -c rapidsai-nightly -c conda-forge -c nvidia cuml=$CUML_VER cuvs=$CUML_VER python=3.10 cuda-version=12.0 numpy~=1.0 \
     && conda clean --all -f -y
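The image now pins cuML 25.08 from the rapidsai-nightly channel. A quick sanity check inside the built container, as a minimal sketch (the exact nightly version string is an assumption; nightlies usually report something like 25.08.00aNNN, so a prefix match is used):

# Run inside the built image to verify the nightly cuML pin took effect.
import cuml

assert cuml.__version__.startswith("25.08"), cuml.__version__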

python/run_test.sh

Lines changed: 3 additions & 1 deletion

@@ -28,8 +28,10 @@ fi
 python -m spark_rapids_ml tests_no_import_change/test_no_import_change.py 0.2
 # runs on cpu
 python tests_no_import_change/test_no_import_change.py 0.2
-# runs on gpu with spark-submit (note: local[1] for spark-rapids-submit hangs probably due to barrier rdd timer threads. TBD root cause)
+# runs on gpu with spark-submit (note: local[1] and pyspark<3.5.6 for spark-rapids-submit hangs probably due to barrier rdd timer threads. TBD root cause)
+pip install pyspark==3.5.6
 spark-rapids-submit --master local-cluster[1,1,1024] tests_no_import_change/test_no_import_change.py 0.2
+pip install -r requirements_dev.txt
 # runs on cpu with spark-submit
 spark-submit --master local-cluster[1,1,1024] tests_no_import_change/test_no_import_change.py 0.2

python/src/spark_rapids_ml/classification.py

Lines changed: 3 additions & 3 deletions

@@ -1080,9 +1080,9 @@ def _single_fit(init_parameters: Dict[str, Any]) -> Dict[str, Any]:
                 **init_parameters,
             )
 
-            logistic_regression.penalty_normalized = False
-            logistic_regression.lbfgs_memory = 10
-            logistic_regression.linesearch_max_iter = 20
+            logistic_regression.solver_model.penalty_normalized = False
+            logistic_regression.solver_model.lbfgs_memory = 10
+            logistic_regression.solver_model.linesearch_max_iter = 20
 
            if is_sparse and pdesc.partition_max_nnz > nnz_limit_for_int32:  # type: ignore
                logistic_regression._convert_index = np.int64
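This hunk tracks a cuML change: the quasi-Newton solver knobs spark-rapids-ml tunes are now reached through the estimator's solver_model attribute instead of being set on LogisticRegression directly. A minimal sketch of the same configuration against plain cuML (assumes a CUDA GPU with cuML 25.08; the data is synthetic):

import numpy as np
from cuml.linear_model import LogisticRegression

X = np.random.rand(64, 4).astype(np.float32)
y = (X[:, 0] > 0.5).astype(np.float32)

lr = LogisticRegression(fit_intercept=True, penalty="l2", C=1.0)
# Same settings as the diff, applied through solver_model.
lr.solver_model.penalty_normalized = False   # align the objective with the Spark side (assumption)
lr.solver_model.lbfgs_memory = 10            # L-BFGS history size
lr.solver_model.linesearch_max_iter = 20     # line-search steps per iteration
lr.fit(X, y)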

python/src/spark_rapids_ml/clustering.py

Lines changed: 2 additions & 2 deletions

@@ -113,7 +113,7 @@ def _get_cuml_params_default(self) -> Dict[str, Any]:
             "max_iter": 300,
             "tol": 0.0001,
             "verbose": False,
-            "random_state": 1,
+            "random_state": None,
             "init": "scalable-k-means++",
             "n_init": "auto",
             "oversampling_factor": 2.0,

@@ -506,7 +506,7 @@ def _construct_kmeans() -> CumlT:
         def _transform_internal(
             kmeans: CumlT, df: Union[pd.DataFrame, np.ndarray]
         ) -> pd.Series:
-            res = list(kmeans.predict(df, normalize_weights=False).to_numpy())
+            res = list(kmeans.predict(df).to_numpy())
             return pd.Series(res)
 
         return _construct_kmeans, _transform_internal, None
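Two behavioral updates here: the recorded cuML default for random_state is now None, so deterministic centroids require passing a seed explicitly, and predict() is called without the normalize_weights keyword, which this cuML version apparently no longer accepts (our reading of the diff). A hedged sketch against plain cuML:

import numpy as np
from cuml.cluster import KMeans

X = np.random.rand(200, 8).astype(np.float32)

# Default random_state is now None; pass a seed for reproducible clustering.
km = KMeans(n_clusters=5, random_state=42)
labels = km.fit(X).predict(X)  # no normalize_weights keyword, per this diff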

python/tests/test_approximate_nearest_neighbors.py

Lines changed: 6 additions & 1 deletion

@@ -85,6 +85,7 @@ def test_params(default_params: bool) -> None:
             "metric_expanded",
             "metric_params",
             "output_type",
+            "n_jobs",
         ],
     )

@@ -556,7 +557,11 @@ def assert_row_equal(r1: Row, r2: Row) -> None:
     )
 
     assert len(reconstructed_collect) == len(knn_df_collect)
-    if algorithm != "ivfpq" and not (algorithm == "ivfflat" and algoParams == None):
+    if (
+        algorithm != "ivfpq"
+        and not (algorithm == "ivfflat" and algoParams == None)
+        and (not algoParams or algoParams.get("build_algo") != "ivf_pq")
+    ):
         # it is fine to skip ivfpq as long as other algorithms assert the same results of approxSimilarityJoin and kneighbors.
         # Also skip ivfflat when algoParams == None. Ivfflat probes only 1/50 of the clusters, leading to unstable results.
         # ivfpq shows non-deterministic distances due to kmeans initialization uses GPU memory runtime values.
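The widened condition now also skips the approxSimilarityJoin-vs-kneighbors comparison whenever algoParams request an ivf_pq build (for example, a CAGRA index built via IVF-PQ, we assume), since such builds inherit ivfpq's non-determinism. Restated as a standalone predicate, with illustrative names:

from typing import Optional

def should_compare_join_to_kneighbors(algorithm: str, algoParams: Optional[dict]) -> bool:
    # Mirrors the test's skip logic for non-deterministic ANN configurations.
    if algorithm == "ivfpq":
        return False  # non-deterministic distances from GPU kmeans initialization
    if algorithm == "ivfflat" and algoParams is None:
        return False  # probes only 1/50 of clusters by default; unstable results
    if algoParams and algoParams.get("build_algo") == "ivf_pq":
        return False  # ivf_pq-based builds inherit the same non-determinism
    return True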

python/tests/test_logistic_regression.py

Lines changed: 2 additions & 1 deletion

@@ -441,6 +441,7 @@ def _func_test_classifier(
     cu_lr = cuLR(fit_intercept=fit_intercept, penalty=penalty, C=C, l1_ratio=l1_ratio)
     cu_lr.solver_model.penalty_normalized = False
     cu_lr.solver_model.lbfgs_memory = 10
+    cu_lr.solver_model.linesearch_max_iter = 20
     cu_lr.fit(X_train, y_train)
 
     spark_conf.update(

@@ -490,7 +491,7 @@ def to_sparse_func(v: Union[SparseVector, DenseVector]) -> SparseVector:
     spark_lr_model: LogisticRegressionModel = spark_lr.fit(train_df)
 
     # test coefficients and intercepts
-    assert spark_lr_model.n_cols == cu_lr.n_cols
+    assert spark_lr_model.n_cols == cu_lr.n_features_in_
 
     # test float32_inputs
     assert spark_lr_model._float32_inputs == float32_inputs
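The last hunk swaps cu_lr.n_cols for the scikit-learn-style n_features_in_, suggesting cuML now reports the fitted feature count through that attribute. A minimal sketch (GPU with cuML 25.08 assumed; synthetic data):

import numpy as np
from cuml.linear_model import LogisticRegression

X = np.random.rand(32, 5).astype(np.float32)
y = (X[:, 0] > 0.5).astype(np.float32)

clf = LogisticRegression().fit(X, y)
assert clf.n_features_in_ == 5  # feature count via the sklearn-style attribute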

python/tests/test_nearest_neighbors.py

Lines changed: 1 addition & 0 deletions

@@ -68,6 +68,7 @@ def test_params(default_params: bool, caplog: LogCaptureFixture) -> None:
             "metric_expanded",
             "metric_params",
             "output_type",
+            "n_jobs",
         ],
     )
     assert cuml_params == NearestNeighbors()._get_cuml_params_default()
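As in the approximate-nearest-neighbors test, "n_jobs" joins the list of cuML constructor params the Spark API deliberately does not expose (it appears to be new in cuML 25.08's NearestNeighbors signature, presumably for scikit-learn parity). The exclusion pattern these tests rely on, sketched with an illustrative subset of names:

# Params present in cuML's signature but with no Spark-side counterpart
# (illustrative subset; the tests list more).
UNSUPPORTED_CUML_PARAMS = {"output_type", "metric_expanded", "metric_params", "n_jobs"}

def filter_cuml_params(cuml_defaults: dict) -> dict:
    # Drop params that spark-rapids-ml does not map to Spark ML params.
    return {k: v for k, v in cuml_defaults.items() if k not in UNSUPPORTED_CUML_PARAMS}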
