Commit ac6c27b
remove debug info
1 parent cf2f195
1 file changed

python/tests_large/test_large_logistic_regression.py

Lines changed: 0 additions & 81 deletions
@@ -119,84 +119,3 @@ def get_nnz_func(pdf_iter: Iterable[pd.DataFrame]) -> Iterable[pd.DataFrame]:
 
 def test_sparse_int64_mg() -> None:
     test_sparse_int64(multi_gpus=True)
-
-
-def test_log_reg_sparsevector_int64_sparkrapidsml() -> None:
-    """
-    This test requires at least 128 GB of CPU memory and 32 GB of GPU memory.
-    It only passes in an environment with 2 GPUs (>= 32 GB each) and >= 256 GB of CPU memory.
-    """
-    elasticNetParam = 1.0
-    gpu_number = 2
-    # Configuration for the 2.2 GB dataset
-    output_num_files = 100  # a larger value means less CPU memory per Spark task
-    data_shape = (int(1e7), 2200)
-    # data_shape = (int(1e5), 2200)
-
-    fraction_sampled_for_test = (
-        1.0 if data_shape[0] <= 100000 else 100000 / data_shape[0]
-    )
-    n_classes = 8
-    tolerance = 0.001
-    est_params: Dict[str, Any] = {
-        "regParam": 0.02,
-        "maxIter": 10,
-        "standardization": False,  # reduces GPU memory, since standardization copies the value array
-    }
-    density = 0.1
-
-    data_gen_args = [
-        "--n_informative",
-        f"{math.ceil(data_shape[1] / 3)}",
-        "--num_rows",
-        str(data_shape[0]),
-        "--num_cols",
-        str(data_shape[1]),
-        "--output_num_files",
-        str(output_num_files),
-        "--dtype",
-        "float32",
-        "--feature_type",
-        "vector",
-        "--output_dir",
-        "./temp",
-        "--n_classes",
-        str(n_classes),
-        "--random_state",
-        "0",
-        "--logistic_regression",
-        "True",
-        "--density",
-        str(density),
-        "--use_gpu",
-        "True",
-    ]
-
-    data_gen = SparseRegressionDataGen(data_gen_args)
-    df, _, _ = data_gen.gen_dataframe_and_meta(_spark)
-
-    df = df.cache()
-    df_gpu = df
-
-    if gpu_number > 1:
-        main_pid = 0
-        pid_col = "pid"
-        delta_ratio = 0.1
-
-        delta_df = df.sample(fraction=delta_ratio, seed=0)
-
-        df = df.withColumn(pid_col, SparkF.lit(main_pid))
-        delta_df = delta_df.withColumn(
-            pid_col, SparkF.monotonically_increasing_id() % (gpu_number * 4)
-        )
-
-        df = df.union(delta_df)
-        df_gpu = df.repartition(gpu_number, pid_col)
-
-    df_test = df.sample(fraction=fraction_sampled_for_test, seed=0)
-    numrows = df_test.count()
-    print(f"finished with numrows {numrows}")
-
-    # Train the logistic regression model
-    lr = LogisticRegression(num_workers=gpu_number, verbose=True, **est_params, elasticNetParam=elasticNetParam)
-    # lr_model = lr.fit(df)
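For reference, the multi-GPU branch of the deleted test relies on a pid-column repartition pattern: tag the bulk of the rows with a single partition id, tag a small sampled "delta" with ids spread over gpu_number * 4 buckets, then hash-repartition on that column so every worker task receives rows. Below is a minimal, self-contained sketch of that pattern using only public PySpark APIs; the local SparkSession, the toy range() DataFrame, and the final assertion are illustrative assumptions, not part of the commit.

# Minimal sketch (not from the commit) of the pid-based repartition pattern above.
# Assumptions: a local SparkSession, a toy range() DataFrame standing in for the
# generated sparse training data, and gpu_number = 2 as in the deleted test.
from pyspark.sql import SparkSession
import pyspark.sql.functions as SparkF

spark = SparkSession.builder.master("local[2]").getOrCreate()

gpu_number = 2
main_pid = 0
pid_col = "pid"

df = spark.range(1000)  # stand-in for the generated training DataFrame

# Small sampled "delta" whose rows will be scattered across many pids.
delta_df = df.sample(fraction=0.1, seed=0)

# Bulk rows all share one pid; delta rows cycle through gpu_number * 4 pids.
# Cast lit(0) to long so both sides of the union have identical schemas.
df = df.withColumn(pid_col, SparkF.lit(main_pid).cast("long"))
delta_df = delta_df.withColumn(
    pid_col, SparkF.monotonically_increasing_id() % (gpu_number * 4)
)

# Union and hash-partition on pid so each of the gpu_number tasks receives data.
df_gpu = df.union(delta_df).repartition(gpu_number, pid_col)
assert df_gpu.rdd.getNumPartitions() == gpu_number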
