spark-rapids-ml_nightly, run 689, failed with an assertion error in benchmark/test_gen_data.py::test_make_sparse_regression: the generated label column contained only 4 of the expected 5 classes (assert 4 == 5). Full pytest output follows.
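
To reproduce locally, running just the failing parametrization should suffice. A minimal sketch using pytest's Python entry point, assuming the repo's benchmark test dependencies are installed; the test ID is copied verbatim from the summary below:

    # Run only the failing parametrization; -x stops at the first failure,
    # -q keeps the output short.
    import pytest

    pytest.main([
        "benchmark/test_gen_data.py::test_make_sparse_regression"
        "[100-None-True-1000-200-density1-True-5-bias1-0-False-float64]",
        "-x",
        "-q",
    ])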
[2025-04-16T04:23:27.123Z] =================================== FAILURES ===================================
[2025-04-16T04:23:27.123Z] _ test_make_sparse_regression[100-None-True-1000-200-density1-True-5-bias1-0-False-float64] _
[2025-04-16T04:23:27.123Z] [gw2] linux -- Python 3.10.17 /root/miniconda3/bin/python3.10
[2025-04-16T04:23:27.124Z]
[2025-04-16T04:23:27.124Z] dtype = 'float64', use_gpu = 'False', redundant_cols = '0'
[2025-04-16T04:23:27.124Z] logistic_regression = 'True', n_classes = '5'
[2025-04-16T04:23:27.124Z] bias = ['0.5', '1.5', '2.5', '3.5', '4.5'], density = ['0.05', '0.1', '0.2']
[2025-04-16T04:23:27.124Z] rows = '1000', cols = '200', density_curve = 'None', shuffle = 'True'
[2025-04-16T04:23:27.124Z] n_chunks = '100'
[2025-04-16T04:23:27.124Z]
[2025-04-16T04:23:27.124Z] @pytest.mark.parametrize("dtype", ["float64"])
[2025-04-16T04:23:27.124Z] @pytest.mark.parametrize("use_gpu", ["True", "False"])
[2025-04-16T04:23:27.124Z] @pytest.mark.parametrize("redundant_cols", ["0", "2"])
[2025-04-16T04:23:27.124Z] @pytest.mark.parametrize(
[2025-04-16T04:23:27.124Z] "logistic_regression, n_classes, bias",
[2025-04-16T04:23:27.124Z] [
[2025-04-16T04:23:27.124Z] ("True", "2", "1.0"),
[2025-04-16T04:23:27.124Z] ("True", "5", ["0.5", "1.5", "2.5", "3.5", "4.5"]),
[2025-04-16T04:23:27.124Z] ("True", "15", "1.5"),
[2025-04-16T04:23:27.124Z] ("False", "0", "1.0"),
[2025-04-16T04:23:27.124Z] ],
[2025-04-16T04:23:27.124Z] )
[2025-04-16T04:23:27.124Z] @pytest.mark.parametrize(
[2025-04-16T04:23:27.124Z] "density",
[2025-04-16T04:23:27.124Z] ["0.25", ["0.05", "0.1", "0.2"]],
[2025-04-16T04:23:27.124Z] )
[2025-04-16T04:23:27.124Z] @pytest.mark.parametrize(
[2025-04-16T04:23:27.124Z] "rows, cols",
[2025-04-16T04:23:27.124Z] [("1000", "200"), pytest.param("10000", "1000", marks=pytest.mark.slow)],
[2025-04-16T04:23:27.124Z] )
[2025-04-16T04:23:27.124Z] @pytest.mark.parametrize(
[2025-04-16T04:23:27.124Z] "density_curve, shuffle",
[2025-04-16T04:23:27.124Z] [
[2025-04-16T04:23:27.124Z] ("None", "True"),
[2025-04-16T04:23:27.124Z] ("Linear", "False"),
[2025-04-16T04:23:27.124Z] ("Exponential", "False"),
[2025-04-16T04:23:27.124Z] pytest.param("Exponential", "True", marks=pytest.mark.slow),
[2025-04-16T04:23:27.124Z] ],
[2025-04-16T04:23:27.124Z] )
[2025-04-16T04:23:27.124Z] @pytest.mark.parametrize("n_chunks", ["100"])
[2025-04-16T04:23:27.124Z] def test_make_sparse_regression(
[2025-04-16T04:23:27.124Z] dtype: str,
[2025-04-16T04:23:27.124Z] use_gpu: str,
[2025-04-16T04:23:27.124Z] redundant_cols: str,
[2025-04-16T04:23:27.124Z] logistic_regression: str,
[2025-04-16T04:23:27.124Z] n_classes: str,
[2025-04-16T04:23:27.124Z] bias: Union[str, List[str]],
[2025-04-16T04:23:27.124Z] density: Union[str, List[str]],
[2025-04-16T04:23:27.124Z] rows: str,
[2025-04-16T04:23:27.124Z] cols: str,
[2025-04-16T04:23:27.124Z] density_curve: str,
[2025-04-16T04:23:27.124Z] shuffle: str,
[2025-04-16T04:23:27.124Z] n_chunks: str,
[2025-04-16T04:23:27.124Z] ) -> None:
[2025-04-16T04:23:27.124Z] input_args = [
[2025-04-16T04:23:27.124Z] "--num_rows",
[2025-04-16T04:23:27.124Z] rows,
[2025-04-16T04:23:27.124Z] "--num_cols",
[2025-04-16T04:23:27.124Z] cols,
[2025-04-16T04:23:27.124Z] "--dtype",
[2025-04-16T04:23:27.124Z] dtype,
[2025-04-16T04:23:27.124Z] "--output_dir",
[2025-04-16T04:23:27.124Z] "temp",
[2025-04-16T04:23:27.124Z] "--output_num_files",
[2025-04-16T04:23:27.124Z] "3",
[2025-04-16T04:23:27.124Z] "--n_informative",
[2025-04-16T04:23:27.124Z] "3",
[2025-04-16T04:23:27.124Z] "--n_classes",
[2025-04-16T04:23:27.124Z] n_classes,
[2025-04-16T04:23:27.124Z] "--noise",
[2025-04-16T04:23:27.124Z] "1.0",
[2025-04-16T04:23:27.124Z] "--random_state",
[2025-04-16T04:23:27.124Z] "0",
[2025-04-16T04:23:27.124Z] "--use_gpu",
[2025-04-16T04:23:27.124Z] use_gpu,
[2025-04-16T04:23:27.124Z] "--redundant_cols",
[2025-04-16T04:23:27.124Z] redundant_cols,
[2025-04-16T04:23:27.124Z] "--logistic_regression",
[2025-04-16T04:23:27.124Z] logistic_regression,
[2025-04-16T04:23:27.124Z] "--density_curve",
[2025-04-16T04:23:27.124Z] density_curve,
[2025-04-16T04:23:27.124Z] "--shuffle",
[2025-04-16T04:23:27.124Z] shuffle,
[2025-04-16T04:23:27.124Z] "--n_chunk",
[2025-04-16T04:23:27.124Z] n_chunks,
[2025-04-16T04:23:27.124Z] ]
[2025-04-16T04:23:27.124Z]
[2025-04-16T04:23:27.124Z] # Add parameters that accept multiple values
[2025-04-16T04:23:27.124Z] input_args.append("--bias")
[2025-04-16T04:23:27.124Z] if isinstance(bias, List):
[2025-04-16T04:23:27.124Z] input_args.extend(bias)
[2025-04-16T04:23:27.124Z] else:
[2025-04-16T04:23:27.124Z] input_args.append(bias)
[2025-04-16T04:23:27.124Z]
[2025-04-16T04:23:27.124Z] input_args.append("--density")
[2025-04-16T04:23:27.124Z] if isinstance(density, List):
[2025-04-16T04:23:27.124Z] input_args.extend(density)
[2025-04-16T04:23:27.124Z] else:
[2025-04-16T04:23:27.124Z] input_args.append(density)
[2025-04-16T04:23:27.124Z]
[2025-04-16T04:23:27.124Z] row_num = int(rows)
[2025-04-16T04:23:27.124Z] col_num = int(cols)
[2025-04-16T04:23:27.124Z] n_classes_num = int(n_classes)
[2025-04-16T04:23:27.124Z]
[2025-04-16T04:23:27.124Z] data_gen = SparseRegressionDataGen(input_args)
[2025-04-16T04:23:27.124Z] args = data_gen.args
[2025-04-16T04:23:27.124Z] assert args is not None
[2025-04-16T04:23:27.124Z] with WithSparkSession(args.spark_confs, shutdown=(not args.no_shutdown)) as spark:
[2025-04-16T04:23:27.124Z] df, _, c = data_gen.gen_dataframe_and_meta(spark)
[2025-04-16T04:23:27.124Z] assert df.rdd.getNumPartitions() == 3, "Unexpected number of partitions"
[2025-04-16T04:23:27.124Z]
[2025-04-16T04:23:27.124Z] pdf: DataFrame = df.toPandas()
[2025-04-16T04:23:27.124Z] X = pdf.iloc[:, 0].to_numpy()
[2025-04-16T04:23:27.124Z] y = pdf.iloc[:, 1].to_numpy()
[2025-04-16T04:23:27.124Z]
[2025-04-16T04:23:27.124Z] assert len(X) == row_num, "X row number mismatch"
[2025-04-16T04:23:27.124Z] for sparseVec in X:
[2025-04-16T04:23:27.124Z] assert sparseVec.size == col_num, "X col number mismatch"
[2025-04-16T04:23:27.124Z] assert y.shape == (row_num,), "y shape mismatch"
[2025-04-16T04:23:27.124Z]
[2025-04-16T04:23:27.124Z] if logistic_regression == "False" or n_classes_num == 2:
[2025-04-16T04:23:27.124Z] assert c.shape == (col_num,), "coef shape mismatch"
[2025-04-16T04:23:27.124Z] assert np.count_nonzero(c) == 3, "Unexpected number of informative features"
[2025-04-16T04:23:27.124Z] else:
[2025-04-16T04:23:27.124Z] assert c.shape == (
[2025-04-16T04:23:27.124Z] col_num,
[2025-04-16T04:23:27.124Z] n_classes_num,
[2025-04-16T04:23:27.124Z] ), "coef shape mismatch"
[2025-04-16T04:23:27.124Z] assert (
[2025-04-16T04:23:27.124Z] np.count_nonzero(c) == 3 * n_classes_num
[2025-04-16T04:23:27.124Z] ), "Unexpected number of informative features"
[2025-04-16T04:23:27.124Z]
[2025-04-16T04:23:27.124Z] X_np = np.array([r.toArray() for r in X])
[2025-04-16T04:23:27.124Z]
[2025-04-16T04:23:27.124Z] if logistic_regression == "True":
[2025-04-16T04:23:27.124Z] # Test that y contains only discrete class labels
[2025-04-16T04:23:27.124Z] possible_labels = range(n_classes_num)
[2025-04-16T04:23:27.124Z] for n in y:
[2025-04-16T04:23:27.124Z] found = False
[2025-04-16T04:23:27.124Z] for l in possible_labels:
[2025-04-16T04:23:27.124Z] if n == l:
[2025-04-16T04:23:27.124Z] found = True
[2025-04-16T04:23:27.124Z] break
[2025-04-16T04:23:27.124Z] assert found, "Invalid label"
[2025-04-16T04:23:27.124Z] else:
[2025-04-16T04:23:27.124Z] # Test that y ~= np.dot(X, c) + bias + N(0, 1.0).
[2025-04-16T04:23:27.124Z] assert_almost_equal(np.std(y - np.dot(X_np, c)), 1.0, decimal=1)
[2025-04-16T04:23:27.124Z]
[2025-04-16T04:23:27.124Z] # Check that the density matches the requested value
[2025-04-16T04:23:27.124Z] count = np.count_nonzero(X_np)
[2025-04-16T04:23:27.124Z]
[2025-04-16T04:23:27.124Z] total = row_num * col_num
[2025-04-16T04:23:27.124Z]
[2025-04-16T04:23:27.124Z] # If there are no randomly shuffled redundant cols, we can check the total density
[2025-04-16T04:23:27.124Z] if redundant_cols == "0" and density_curve == "None":
[2025-04-16T04:23:27.124Z] if isinstance(density, List):
[2025-04-16T04:23:27.124Z] density_num = sum([float(d) for d in density]) / len(density)
[2025-04-16T04:23:27.124Z] else:
[2025-04-16T04:23:27.124Z] density_num = float(density)
[2025-04-16T04:23:27.124Z]
[2025-04-16T04:23:27.124Z] assert (
[2025-04-16T04:23:27.124Z] count > total * density_num * 0.95
[2025-04-16T04:23:27.124Z] and count < total * density_num * 1.05
[2025-04-16T04:23:27.124Z] )
[2025-04-16T04:23:27.124Z]
[2025-04-16T04:23:27.124Z] # If not shuffled, check that each chunk's density matches the specified value/curve
[2025-04-16T04:23:27.124Z] n_chunks_num = int(n_chunks)
[2025-04-16T04:23:27.124Z] if shuffle == "False":
[2025-04-16T04:23:27.124Z] orig_cols = col_num - int(redundant_cols)
[2025-04-16T04:23:27.124Z] num_partitions = 3
[2025-04-16T04:23:27.124Z]
[2025-04-16T04:23:27.124Z] if isinstance(density, List):
[2025-04-16T04:23:27.124Z] density_num = float(density[0])
[2025-04-16T04:23:27.124Z] else:
[2025-04-16T04:23:27.124Z] density_num = float(density)
[2025-04-16T04:23:27.124Z]
[2025-04-16T04:23:27.124Z] if density_curve == "Linear":
[2025-04-16T04:23:27.124Z] density_values = np.linspace(
[2025-04-16T04:23:27.124Z] num_partitions / row_num, density_num, n_chunks_num
[2025-04-16T04:23:27.124Z] )
[2025-04-16T04:23:27.124Z] density_values *= n_chunks_num * density_num / sum(density_values)
[2025-04-16T04:23:27.124Z] else:
[2025-04-16T04:23:27.124Z] density_values = np.logspace(
[2025-04-16T04:23:27.124Z] np.log10(num_partitions / row_num),
[2025-04-16T04:23:27.124Z] np.log10(density_num),
[2025-04-16T04:23:27.124Z] n_chunks_num,
[2025-04-16T04:23:27.124Z] )
[2025-04-16T04:23:27.124Z] density_values *= n_chunks_num * density_num / sum(density_values)
[2025-04-16T04:23:27.124Z]
[2025-04-16T04:23:27.124Z] for i in range(len(density_values)):
[2025-04-16T04:23:27.124Z] if density_values[i] > 1:
[2025-04-16T04:23:27.124Z] density_values[i] = 1
[2025-04-16T04:23:27.124Z]
[2025-04-16T04:23:27.124Z] col_per_chunk = np.full(n_chunks_num, orig_cols // n_chunks_num)
[2025-04-16T04:23:27.125Z] col_per_chunk[: (orig_cols % n_chunks_num)] += 1
[2025-04-16T04:23:27.125Z] chunk_boundary = np.cumsum(col_per_chunk)
[2025-04-16T04:23:27.125Z]
[2025-04-16T04:23:27.125Z] dense_count = 0
[2025-04-16T04:23:27.125Z]
[2025-04-16T04:23:27.125Z] for i in range(len(chunk_boundary)):
[2025-04-16T04:23:27.125Z] start = 0 if i == 0 else chunk_boundary[i - 1]
[2025-04-16T04:23:27.125Z] dense_count = np.count_nonzero(X_np[:, start : chunk_boundary[i]])
[2025-04-16T04:23:27.125Z]
[2025-04-16T04:23:27.125Z] col_density = density_values[i]
[2025-04-16T04:23:27.125Z] chunk_size = col_per_chunk[i]
[2025-04-16T04:23:27.125Z]
[2025-04-16T04:23:27.125Z] assert dense_count >= chunk_size * num_partitions * int(
[2025-04-16T04:23:27.125Z] (row_num // num_partitions) * col_density - 1
[2025-04-16T04:23:27.125Z] ) and dense_count <= chunk_size * num_partitions * int(
[2025-04-16T04:23:27.125Z] (row_num // num_partitions + 1) * col_density + 1
[2025-04-16T04:23:27.125Z] )
[2025-04-16T04:23:27.125Z]
[2025-04-16T04:23:27.125Z] # Check that all classes exist
[2025-04-16T04:23:27.125Z] if logistic_regression == "True":
[2025-04-16T04:23:27.125Z] > assert np.unique(y).shape[0] == n_classes_num
[2025-04-16T04:23:27.125Z] E assert 4 == 5
[2025-04-16T04:23:27.125Z]
[2025-04-16T04:23:27.125Z] benchmark/test_gen_data.py:427: AssertionError
[2025-04-16T04:23:27.125Z] ----------------------------- Captured stdout call -----------------------------
[2025-04-16T04:23:27.125Z] 25/04/16 04:18:24 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.
[2025-04-16T04:23:27.125Z] 25/04/16 04:18:24 WARN Utils: Service 'SparkUI' could not bind on port 4041. Attempting port 4042.
[2025-04-16T04:23:27.125Z] Passing {'n_informative': 3, 'bias': [0.5, 1.5, 2.5, 3.5, 4.5], 'noise': 1.0, 'shuffle': True, 'redundant_cols': 0, 'random_state': 0, 'density': [0.05, 0.1, 0.2], 'use_gpu': False, 'logistic_regression': True, 'density_curve': 'None', 'n_classes': 5, 'n_chunk': 100} to make_sparse_regression
[2025-04-16T04:23:27.125Z] stopping spark session
[2025-04-16T04:23:27.125Z] ----------------------------- Captured stderr call -----------------------------
[2025-04-16T04:23:27.125Z]
[Stage 0:> (0 + 1) / 3]
[Stage 0:===================> (1 + 1) / 3]
[Stage 0:=======================================> (2 + 1) / 3]
[2025-04-16T04:23:27.125Z] =============================== warnings summary ===============================
[2025-04-16T04:23:27.125Z] benchmark/test_gen_data.py: 616 warnings
[2025-04-16T04:23:27.125Z] /root/miniconda3/lib/python3.10/site-packages/pyspark/sql/pandas/utils.py:37: DeprecationWarning: distutils Version classes are deprecated. Use packaging.version instead.
[2025-04-16T04:23:27.125Z] if LooseVersion(pandas.__version__) < LooseVersion(minimum_pandas_version):
[2025-04-16T04:23:27.125Z]
[2025-04-16T04:23:27.125Z] benchmark/test_gen_data.py: 616 warnings
[2025-04-16T04:23:27.125Z] /root/miniconda3/lib/python3.10/site-packages/pyspark/sql/pandas/utils.py:64: DeprecationWarning: distutils Version classes are deprecated. Use packaging.version instead.
[2025-04-16T04:23:27.125Z] if LooseVersion(pyarrow.__version__) < LooseVersion(minimum_pyarrow_version):
[2025-04-16T04:23:27.125Z]
[2025-04-16T04:23:27.125Z] benchmark/test_gen_data.py: 96 warnings
[2025-04-16T04:23:27.125Z] /root/miniconda3/lib/python3.10/site-packages/pyspark/sql/pandas/conversion.py:114: UserWarning: toPandas attempted Arrow optimization because 'spark.sql.execution.arrow.pyspark.enabled' is set to true; however, failed by the reason below:
[2025-04-16T04:23:27.125Z] Unsupported type in conversion to Arrow: VectorUDT()
[2025-04-16T04:23:27.125Z] Attempting non-optimization as 'spark.sql.execution.arrow.pyspark.fallback.enabled' is set to true.
[2025-04-16T04:23:27.125Z] warn(msg)
[2025-04-16T04:23:27.125Z]
[2025-04-16T04:23:27.125Z] -- Docs: https://docs.pytest.org/en/stable/how-to/capture-warnings.html
[2025-04-16T04:23:27.125Z] =========================== short test summary info ============================
[2025-04-16T04:23:27.125Z] FAILED benchmark/test_gen_data.py::test_make_sparse_regression[100-None-True-1000-200-density1-True-5-bias1-0-False-float64] - assert 4 == 5
[2025-04-16T04:23:27.125Z] ==== 1 failed, 145 passed, 160 skipped, 1328 warnings in 379.98s (0:06:19) =====
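
The failing check is the final class-coverage assertion at benchmark/test_gen_data.py:427, assert np.unique(y).shape[0] == n_classes_num. Note that the run passes --random_state 0, so if label generation were fully deterministic this failure should reproduce on every run; intermittent nightly failures would instead point at nondeterminism in the distributed generation path (e.g. partition ordering under shuffle=True). A minimal illustration of how the check can trip, with hypothetical numbers rather than the generator's actual code: with only 1000 rows over 5 classes, a sufficiently skewed label distribution can leave one class empty.

    # Hypothetical sketch (not the generator's code): a skewed label
    # distribution over few rows can miss a class entirely, which is what
    # `assert np.unique(y).shape[0] == n_classes_num` guards against.
    import numpy as np

    rng = np.random.default_rng()
    n_rows, n_classes = 1000, 5
    # Assumed skewed class probabilities; the real ones come from the
    # logistic model over the generated sparse features and the bias
    # vector [0.5, 1.5, 2.5, 3.5, 4.5].
    probs = np.array([0.50, 0.30, 0.15, 0.049, 0.001])
    y = rng.choice(n_classes, size=n_rows, p=probs)
    # With p=0.001, the last class is absent about 37% of the time
    # (0.999**1000 ~= 0.37), so such a check is inherently sensitive to
    # the draw.
    print(np.unique(y).shape[0])  # may print 4 instead of 5

If the class probabilities implied by the generated features and bias vector really can get this skewed, possible fixes are regenerating until all classes appear, raising the row count for high n_classes parametrizations, or relaxing the assertion for rare missing classes.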