Fix NearestNeighbors _ensureIdCol to check whether id_col in df.columns instead of relying on isSet(idCol) (#642)

lijinf2 · web-flow · commit 1788e9d3df8e · 2024-05-08T22:59:59.000-07:00
* fix ensureIdCol to avoid using isSet(idCol)

* simplify the logic of ensureIdCol

* try setting idCol to None

---------

Signed-off-by: Jinfeng <jinfengl@nvidia.com>
diff --git a/python/src/spark_rapids_ml/knn.py b/python/src/spark_rapids_ml/knn.py
@@ -85,7 +85,7 @@ class _NearestNeighborsCumlParams(
 
     def __init__(self) -> None:
         super().__init__()
-        self._setDefault(idCol=alias.row_number)
+        self._setDefault(idCol=None)
 
     k = Param(
         Params._dummy(),
@@ -114,6 +114,16 @@ def getK(self: P) -> int:
         """
         return self.getOrDefault("k")
 
+    def _getIdColOrDefault(self) -> str:
+        """
+        Gets the value of `idCol`.
+        """
+
+        res = self.getIdCol()
+        if res is None:
+            res = alias.row_number
+        return res
+
     def setInputCol(self: P, value: Union[str, List[str]]) -> P:
         """
         Sets the value of :py:attr:`inputCol` or :py:attr:`inputCols`.
@@ -142,19 +152,32 @@ def _ensureIdCol(self, df: DataFrame) -> DataFrame:
         Ensure an id column exists in the input dataframe. Add the column if not exists.
         Overwritten for knn assumption on error for not setting idCol and duplicate exists.
         """
-        if not self.isSet("idCol") and self.getIdCol() in df.columns:
-            raise ValueError(
-                f"Cannot create a default id column since a column with the default name '{self.getIdCol()}' already exists."
-                + "Please specify an id column"
-            )
 
         id_col_name = self.getIdCol()
-        df_withid = (
-            df
-            if self.isSet("idCol")
-            else df.select(monotonically_increasing_id().alias(id_col_name), "*")
-        )
-        return df_withid
+        if id_col_name is None:
+            if alias.row_number in df.columns:
+                raise ValueError(
+                    f"Trying to create an id column with default name {alias.row_number}. But a column with the same name already exists."
+                )
+            else:
+                get_logger(self.__class__).info(
+                    f"idCol not set. Spark Rapids ML will create one with default name {alias.row_number}."
+                )
+                df_withid = df.select(
+                    monotonically_increasing_id().alias(alias.row_number), "*"
+                )
+                return df_withid
+        else:
+            if id_col_name in df.columns:
+                return df
+            else:
+                get_logger(self.__class__).info(
+                    f"column {id_col_name} does not exists in the input dataframe. Spark Rapids ML will create the {id_col_name} column."
+                )
+                df_withid = df.select(
+                    monotonically_increasing_id().alias(alias.row_number), "*"
+                )
+                return df_withid
 
 
 class NearestNeighbors(
@@ -179,7 +202,7 @@ class NearestNeighbors(
             * When the value is a string, the feature columns must be assembled into 1 column with vector or array type.
             * When the value is a list of strings, the feature columns must be numeric types.
 
-    idCol: str
+    idCol: str (default = None)
         the name of the column in a dataframe that uniquely identifies each vector. idCol should be set
         if such a column exists in the dataframe. If idCol is not set, a column with the name `unique_id`
         will be automatically added to the dataframe and used as unique identifier for each vector.
@@ -400,7 +423,7 @@ def exactNearestNeighborsJoin(
             where item_vector v1 is one of the k nearest neighbors of query_vector v2 and their distance is dist(v1, v2).
         """
 
-        id_col_name = self.getIdCol()
+        id_col_name = self._getIdColOrDefault()
 
         # call kneighbors then prepare return results
         (item_df_withid, query_df_withid, knn_df) = self.kneighbors(query_df)
@@ -471,7 +494,9 @@ def _out_schema(self) -> Union[StructType, str]:  # type: ignore
         return StructType(
             [
                 StructField(
-                    f"query_{self.getIdCol()}", ArrayType(LongType(), False), False
+                    f"query_{self._getIdColOrDefault()}",
+                    ArrayType(LongType(), False),
+                    False,
                 ),
                 StructField(
                     "indices", ArrayType(ArrayType(LongType(), False), False), False
@@ -509,11 +534,8 @@ def _pre_process_data(  # type: ignore
 
         select_cols.append(col(alias.label))
 
-        if self.hasParam("idCol") and self.isDefined("idCol"):
-            id_col_name = self.getOrDefault("idCol")
-            select_cols.append(col(id_col_name).alias(alias.row_number))
-        else:
-            select_cols.append(col(alias.row_number))
+        id_col_name = self._getIdColOrDefault()
+        select_cols.append(col(id_col_name).alias(alias.row_number))
 
         return select_cols, multi_col_names, dimension, feature_type
 
@@ -561,8 +583,8 @@ def kneighbors(self, query_df: DataFrame) -> Tuple[DataFrame, DataFrame, DataFra
         pipelinedrdd = self._call_cuml_fit_func(union_df, partially_collect=False)
         pipelinedrdd = pipelinedrdd.repartition(query_default_num_partitions)  # type: ignore
 
-        query_id_col_name = f"query_{self.getIdCol()}"
-        id_col_type = dict(union_df.dtypes)[self.getIdCol()]
+        query_id_col_name = f"query_{self._getIdColOrDefault()}"
+        id_col_type = dict(union_df.dtypes)[self._getIdColOrDefault()]
         knn_rdd = pipelinedrdd.flatMap(
             lambda row: list(
                 zip(row[query_id_col_name], row["indices"], row["distances"])
@@ -584,7 +606,7 @@ def _get_cuml_fit_func(
     ]:
         label_isdata = self._label_isdata
         label_isquery = self._label_isquery
-        id_col_name = self.getIdCol()
+        id_col_name = self._getIdColOrDefault()
 
         def _cuml_fit(
             dfs: FitInputType,
@@ -849,7 +871,7 @@ class ApproximateNearestNeighbors(
             * When the value is a string, the feature columns must be assembled into 1 column with vector or array type.
             * When the value is a list of strings, the feature columns must be numeric types.
 
-    idCol: str
+    idCol: str (default = None)
         the name of the column in a dataframe that uniquely identifies each vector. idCol should be set
         if such a column exists in the dataframe. If idCol is not set, a column with the name `unique_id`
         will be automatically added to the dataframe and used as unique identifier for each vector.
@@ -1037,9 +1059,7 @@ def __init__(
         self.bcast_qfeatures: Optional[Broadcast] = None
 
     def _out_schema(self) -> Union[StructType, str]:  # type: ignore
-        return (
-            f"query_{self.getIdCol()} long, indices array<long>, distances array<float>"
-        )
+        return f"query_{self._getIdColOrDefault()} long, indices array<long>, distances array<float>"
 
     def _pre_process_data(
         self, dataset: DataFrame
@@ -1049,9 +1069,8 @@ def _pre_process_data(
             dataset
         )
 
-        if self.hasParam("idCol") and self.isDefined("idCol"):
-            id_col_name = self.getOrDefault("idCol")
-            dataset = dataset.withColumnRenamed(id_col_name, alias.row_number)
+        id_col_name = self._getIdColOrDefault()
+        dataset = dataset.withColumnRenamed(id_col_name, alias.row_number)
 
         select_cols.append(alias.row_number)
 
@@ -1179,7 +1198,7 @@ def kneighbors(self, query_df: DataFrame) -> Tuple[DataFrame, DataFrame, DataFra
         )
         k = self.getK()
 
-        query_id_col_name = f"query_{self.getIdCol()}"
+        query_id_col_name = f"query_{self._getIdColOrDefault()}"
 
         ascending = False if self.getMetric() == "inner_product" else True
 
@@ -1221,7 +1240,7 @@ def _construct_sgnn() -> CumlT:
         row_number_col = alias.row_number
         input_col, input_cols = self._get_input_columns()
         assert input_col is not None or input_cols is not None
-        id_col_name = self.getIdCol()
+        id_col_name = self._getIdColOrDefault()
 
         bcast_qids = self.bcast_qids
         bcast_qfeatures = self.bcast_qfeatures
diff --git a/python/tests/test_approximate_nearest_neighbors.py b/python/tests/test_approximate_nearest_neighbors.py
@@ -229,7 +229,9 @@ def cal_avg_dist_gap(distances_ann: np.ndarray) -> float:
 
         ascending = False if metric == "inner_product" else True
         reconstructed_knn_df = reconstruct_knn_df(
-            knnjoin_df, row_identifier_col=knn_model.getIdCol(), ascending=ascending
+            knnjoin_df,
+            row_identifier_col=knn_model._getIdColOrDefault(),
+            ascending=ascending,
         )
         reconstructed_collect = reconstructed_knn_df.collect()
 
diff --git a/python/tests/test_nearest_neighbors.py b/python/tests/test_nearest_neighbors.py
@@ -234,6 +234,17 @@ def assert_knn_metadata_equal(knn_metadata: List[List[str]]) -> None:
                 assert knnjoin_queries[i]["features"] == query[i][0]
             assert knnjoin_queries[i]["metadata"] == query[i][1]
 
+        # Test fit(dataset, ParamMap) that copies existing estimator
+        # After copy, self.isSet("idCol") becomes true. But the added id column does not exist in the dataframe
+        paramMap = gpu_knn.extractParamMap()
+        gpu_model_v2 = gpu_knn.fit(data_df, paramMap)
+
+        assert gpu_knn.isSet("idCol") is False
+        assert gpu_model_v2.isSet("idCol") is True
+
+        (_, _, knn_df_v2) = gpu_model_v2.kneighbors(query_df)
+        assert knn_df_v2.collect() == knn_df.collect()
+
         return gpu_knn, gpu_model
 
 
@@ -432,7 +443,7 @@ def test_nearest_neighbors(
         knn_model.setIdCol(item_df_withid.dtypes[0][0])
         knnjoin_df = knn_model.exactNearestNeighborsJoin(query_df_withid)
         reconstructed_knn_df = reconstruct_knn_df(
-            knnjoin_df, row_identifier_col=knn_model.getIdCol()
+            knnjoin_df, row_identifier_col=knn_model._getIdColOrDefault()
         )
         assert reconstructed_knn_df.collect() == knn_df.collect()
 

Original file line number	Diff line number	Diff line change
`@@ -229,7 +229,9 @@ def cal_avg_dist_gap(distances_ann: np.ndarray) -> float:`
`229`	`229`
`230`	`230`	`ascending = False if metric == "inner_product" else True`
`231`	`231`	`reconstructed_knn_df = reconstruct_knn_df(`
`232`		`- knnjoin_df, row_identifier_col=knn_model.getIdCol(), ascending=ascending`
	`232`	`+ knnjoin_df,`
	`233`	`+ row_identifier_col=knn_model._getIdColOrDefault(),`
	`234`	`+ ascending=ascending,`
`233`	`235`	`)`
`234`	`236`	`reconstructed_collect = reconstructed_knn_df.collect()`
`235`	`237`