
[BUG] test_make_sparse_regression[100-None-True-1000-200-density1-True-5-bias1-0-False-float64] - assert 4 == 5 #892

Description

@pxLi

spark-rapids-ml_nightly, run: 689

Assertion failure:

=================================== FAILURES ===================================
_ test_make_sparse_regression[100-None-True-1000-200-density1-True-5-bias1-0-False-float64] _
[gw2] linux -- Python 3.10.17 /root/miniconda3/bin/python3.10

dtype = 'float64', use_gpu = 'False', redundant_cols = '0'
logistic_regression = 'True', n_classes = '5'
bias = ['0.5', '1.5', '2.5', '3.5', '4.5'], density = ['0.05', '0.1', '0.2']
rows = '1000', cols = '200', density_curve = 'None', shuffle = 'True'
n_chunks = '100'

    @pytest.mark.parametrize("dtype", ["float64"])
    @pytest.mark.parametrize("use_gpu", ["True", "False"])
    @pytest.mark.parametrize("redundant_cols", ["0", "2"])
    @pytest.mark.parametrize(
        "logistic_regression, n_classes, bias",
        [
            ("True", "2", "1.0"),
            ("True", "5", ["0.5", "1.5", "2.5", "3.5", "4.5"]),
            ("True", "15", "1.5"),
            ("False", "0", "1.0"),
        ],
    )
    @pytest.mark.parametrize(
        "density",
        ["0.25", ["0.05", "0.1", "0.2"]],
    )
    @pytest.mark.parametrize(
        "rows, cols",
        [("1000", "200"), pytest.param("10000", "1000", marks=pytest.mark.slow)],
    )
    @pytest.mark.parametrize(
        "density_curve, shuffle",
        [
            ("None", "True"),
            ("Linear", "False"),
            ("Exponential", "False"),
            pytest.param("Exponential", "True", marks=pytest.mark.slow),
        ],
    )
    @pytest.mark.parametrize("n_chunks", ["100"])
    def test_make_sparse_regression(
        dtype: str,
        use_gpu: str,
        redundant_cols: str,
        logistic_regression: str,
        n_classes: str,
        bias: Union[str, List[str]],
        density: Union[str, List[str]],
        rows: str,
        cols: str,
        density_curve: str,
        shuffle: str,
        n_chunks: str,
    ) -> None:
        input_args = [
            "--num_rows",
            rows,
            "--num_cols",
            cols,
            "--dtype",
            dtype,
            "--output_dir",
            "temp",
            "--output_num_files",
            "3",
            "--n_informative",
            "3",
            "--n_classes",
            n_classes,
            "--noise",
            "1.0",
            "--random_state",
            "0",
            "--use_gpu",
            use_gpu,
            "--redundant_cols",
            redundant_cols,
            "--logistic_regression",
            logistic_regression,
            "--density_curve",
            density_curve,
            "--shuffle",
            shuffle,
            "--n_chunk",
            n_chunks,
        ]

        # Add parameters with multiple value
        input_args.append("--bias")
        if isinstance(bias, List):
            input_args.extend(bias)
        else:
            input_args.append(bias)

        input_args.append("--density")
        if isinstance(density, List):
            input_args.extend(density)
        else:
            input_args.append(density)

        row_num = int(rows)
        col_num = int(cols)
        n_classes_num = int(n_classes)

        data_gen = SparseRegressionDataGen(input_args)
        args = data_gen.args
        assert args is not None
        with WithSparkSession(args.spark_confs, shutdown=(not args.no_shutdown)) as spark:
            df, _, c = data_gen.gen_dataframe_and_meta(spark)
            assert df.rdd.getNumPartitions() == 3, "Unexpected number of partitions"

            pdf: DataFrame = df.toPandas()
            X = pdf.iloc[:, 0].to_numpy()
            y = pdf.iloc[:, 1].to_numpy()

            assert len(X) == row_num, "X row number mismatch"
            for sparseVec in X:
                assert sparseVec.size == col_num, "X col number mismatch"
            assert y.shape == (row_num,), "y shape mismatch"

            if logistic_regression == "False" or n_classes_num == 2:
                assert c.shape == (col_num,), "coef shape mismatch"
                assert np.count_nonzero(c) == 3, "Unexpected number of informative features"
            else:
                assert c.shape == (
                    col_num,
                    n_classes_num,
                ), "coef shape mismatch"
                assert (
                    np.count_nonzero(c) == 3 * n_classes_num
                ), "Unexpected number of informative features"

            X_np = np.array([r.toArray() for r in X])

            if logistic_regression == "True":
                # Test that X consists of only discrete label
                possible_labels = range(n_classes_num)
                for n in y:
                    found = False
                    for l in possible_labels:
                        if n == l:
                            found = True
                            break
                    assert found, "Invalid label"
            else:
                # Test that y ~= np.dot(X, c) + bias + N(0, 1.0).
                assert_almost_equal(np.std(y - np.dot(X_np, c)), 1.0, decimal=1)

            # Check density match
            count = np.count_nonzero(X_np)

            total = row_num * col_num

            # If there is no random shuffled redundant cols, we can check the total density
            if redundant_cols == "0" and density_curve == "None":
                if isinstance(density, List):
                    density_num = sum([float(d) for d in density]) / len(density)
                else:
                    density_num = float(density)

                assert (
                    count > total * density_num * 0.95
                    and count < total * density_num * 1.05
                )

            # If no shuffle, test to see if the chunk density is as specified/curved
            n_chunks_num = int(n_chunks)
            if shuffle == "False":
                orig_cols = col_num - int(redundant_cols)
                num_partitions = 3

                if isinstance(density, List):
                    density_num = float(density[0])
                else:
                    density_num = float(density)

                if density_curve == "Linear":
                    density_values = np.linspace(
                        num_partitions / row_num, density_num, n_chunks_num
                    )
                    density_values *= n_chunks_num * density_num / sum(density_values)
                else:
                    density_values = np.logspace(
                        np.log10(num_partitions / row_num),
                        np.log10(density_num),
                        n_chunks_num,
                    )
                    density_values *= n_chunks_num * density_num / sum(density_values)

                for i in range(len(density_values)):
                    if density_values[i] > 1:
                        density_values[i] = 1

                col_per_chunk = np.full(n_chunks_num, orig_cols // n_chunks_num)
                col_per_chunk[: (orig_cols % n_chunks_num)] += 1
                chunk_boundary = np.cumsum(col_per_chunk)

                dense_count = 0

                for i in range(len(chunk_boundary)):
                    start = 0 if i == 0 else chunk_boundary[i - 1]
                    dense_count = np.count_nonzero(X_np[:, start : chunk_boundary[i]])

                    col_density = density_values[i]
                    chunk_size = col_per_chunk[i]

                    assert dense_count >= chunk_size * num_partitions * int(
                        (row_num // num_partitions) * col_density - 1
                    ) and dense_count <= chunk_size * num_partitions * int(
                        (row_num // num_partitions + 1) * col_density + 1
                    )

            # Check all clusters exists
            if logistic_regression == "True":
>               assert np.unique(y).shape[0] == n_classes_num
E               assert 4 == 5

benchmark/test_gen_data.py:427: AssertionError
----------------------------- Captured stdout call -----------------------------
25/04/16 04:18:24 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.
25/04/16 04:18:24 WARN Utils: Service 'SparkUI' could not bind on port 4041. Attempting port 4042.
Passing {'n_informative': 3, 'bias': [0.5, 1.5, 2.5, 3.5, 4.5], 'noise': 1.0, 'shuffle': True, 'redundant_cols': 0, 'random_state': 0, 'density': [0.05, 0.1, 0.2], 'use_gpu': False, 'logistic_regression': True, 'density_curve': 'None', 'n_classes': 5, 'n_chunk': 100} to make_sparse_regression
stopping spark session
----------------------------- Captured stderr call -----------------------------
[Stage 0:>                                                          (0 + 1) / 3]
[Stage 0:===================>                                       (1 + 1) / 3]
[Stage 0:=======================================>                   (2 + 1) / 3]
=============================== warnings summary ===============================
benchmark/test_gen_data.py: 616 warnings
  /root/miniconda3/lib/python3.10/site-packages/pyspark/sql/pandas/utils.py:37: DeprecationWarning: distutils Version classes are deprecated. Use packaging.version instead.
    if LooseVersion(pandas.__version__) < LooseVersion(minimum_pandas_version):

benchmark/test_gen_data.py: 616 warnings
  /root/miniconda3/lib/python3.10/site-packages/pyspark/sql/pandas/utils.py:64: DeprecationWarning: distutils Version classes are deprecated. Use packaging.version instead.
    if LooseVersion(pyarrow.__version__) < LooseVersion(minimum_pyarrow_version):

benchmark/test_gen_data.py: 96 warnings
  /root/miniconda3/lib/python3.10/site-packages/pyspark/sql/pandas/conversion.py:114: UserWarning: toPandas attempted Arrow optimization because 'spark.sql.execution.arrow.pyspark.enabled' is set to true; however, failed by the reason below:
    Unsupported type in conversion to Arrow: VectorUDT()
  Attempting non-optimization as 'spark.sql.execution.arrow.pyspark.fallback.enabled' is set to true.
    warn(msg)

-- Docs: https://docs.pytest.org/en/stable/how-to/capture-warnings.html
=========================== short test summary info ============================
FAILED benchmark/test_gen_data.py::test_make_sparse_regression[100-None-True-1000-200-density1-True-5-bias1-0-False-float64] - assert 4 == 5
==== 1 failed, 145 passed, 160 skipped, 1328 warnings in 379.98s (0:06:19) =====
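Context on the failing assertion: the test requires all n_classes labels to appear in the generated sample (`np.unique(y).shape[0] == n_classes_num`), but with only 1000 rows nothing guarantees every class is drawn at least once, so a rare class can be absent and the unique count comes up short (4 instead of 5). A minimal NumPy sketch of that failure mode; this is not the repository's SparseRegressionDataGen, and the argmax-of-scores label model below is a hypothetical stand-in:

# Minimal sketch (assumed label model, not spark-rapids-ml internals):
# labels taken as the argmax of per-class scores need not cover every
# class in a finite sample.
import numpy as np

rng = np.random.default_rng(0)
n_rows, n_classes = 1000, 5

scores = rng.normal(size=(n_rows, n_classes))
scores[:, -1] -= 4.0  # make the last class very unlikely to win the argmax
y = scores.argmax(axis=1)

# Typically prints a value below n_classes, which is exactly what trips
# the test's `assert np.unique(y).shape[0] == n_classes_num`.
print(np.unique(y).shape[0])

One possible direction (a suggestion, not what the test currently does) is to either guarantee class coverage in the generator for the tested seed/row counts, or relax the assertion to tolerate a missing rare class.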
