@@ -119,84 +119,3 @@ def get_nnz_func(pdf_iter: Iterable[pd.DataFrame]) -> Iterable[pd.DataFrame]:
 
 def test_sparse_int64_mg() -> None:
     test_sparse_int64(multi_gpus=True)
-
-
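-# Exercises spark-rapids-ml LogisticRegression on a large synthetic sparse-vector
-# dataset: at density 0.1, a (1e7 x 2200) feature matrix has ~2.2e9 nonzeros,
-# beyond the int32 index range that the int64 variants of these tests target.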
-def test_log_reg_sparsevector_int64_sparkrapidsml() -> None:
-    """
-    This test requires a minimum of 128 GB CPU memory and 32 GB GPU memory.
-    It only passes in an environment with 2 GPUs (>= 32 GB each) and >= 256 GB CPU memory.
-    """
-    elasticNetParam = 1.0
-    gpu_number = 2
-    # Configuration for the 2.2GB dataset
-    output_num_files = 100  # a large value reduces the CPU memory needed by each Spark task
-    data_shape = (int(1e7), 2200)
-    # data_shape = (int(1e5), 2200)
-
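-    # Cap the sampled test subset at 100k rows so it stays cheap on the full-size dataset.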
-    fraction_sampled_for_test = (
-        1.0 if data_shape[0] <= 100000 else 100000 / data_shape[0]
-    )
-    n_classes = 8
-    tolerance = 0.001
-    est_params: Dict[str, Any] = {
-        "regParam": 0.02,
-        "maxIter": 10,
-        "standardization": False,  # reduce GPU memory since standardization copies the value array
-    }
-    density = 0.1
-
-    data_gen_args = [
-        "--n_informative",
-        f"{math.ceil(data_shape[1] / 3)}",
-        "--num_rows",
-        str(data_shape[0]),
-        "--num_cols",
-        str(data_shape[1]),
-        "--output_num_files",
-        str(output_num_files),
-        "--dtype",
-        "float32",
-        "--feature_type",
-        "vector",
-        "--output_dir",
-        "./temp",
-        "--n_classes",
-        str(n_classes),
-        "--random_state",
-        "0",
-        "--logistic_regression",
-        "True",
-        "--density",
-        str(density),
-        "--use_gpu",
-        "True",
-    ]
-
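-    # SparseRegressionDataGen consumes the argparse-style arguments above and
-    # synthesizes the labeled sparse dataset as a Spark DataFrame (the two
-    # metadata return values are unused here).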
-    data_gen = SparseRegressionDataGen(data_gen_args)
-    df, _, _ = data_gen.gen_dataframe_and_meta(_spark)
-
-    df = df.cache()
-    df_gpu = df
-
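-    # With more than one GPU, append a small sampled "delta" of rows tagged with
-    # varied partition ids, then repartition on that column so rows are spread
-    # across all GPU workers instead of collapsing onto a single partition.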
-    if gpu_number > 1:
-        main_pid = 0
-        pid_col = "pid"
-        delta_ratio = 0.1
-
-        delta_df = df.sample(fraction=delta_ratio, seed=0)
-
-        df = df.withColumn(pid_col, SparkF.lit(main_pid))
-        delta_df = delta_df.withColumn(
-            pid_col, SparkF.monotonically_increasing_id() % (gpu_number * 4)
-        )
-
-        df = df.union(delta_df)
-        df_gpu = df.repartition(gpu_number, pid_col)
-
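-    # count() is an action: it materializes the sampled subset (and populates
-    # the cache of the underlying DataFrame) before any fitting happens.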
-    df_test = df.sample(fraction=fraction_sampled_for_test, seed=0)
-    numrows = df_test.count()
-    print(f"finished with numrows {numrows}")
-
-    # Train the Logistic Regression model
-    lr = LogisticRegression(num_workers=gpu_number, verbose=True, **est_params, elasticNetParam=elasticNetParam)
-    # lr_model = lr.fit(df)
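
The partition-spreading trick above is self-contained enough to demonstrate on its own. Below is a minimal sketch, assuming a local SparkSession; num_workers and the column names are illustrative stand-ins for gpu_number and pid_col:

from pyspark.sql import SparkSession
import pyspark.sql.functions as F

spark = SparkSession.builder.master("local[2]").getOrCreate()
num_workers = 2  # stand-in for gpu_number

df = spark.range(1000)  # toy DataFrame with a single "id" column

# Tag every row of the main DataFrame with one fixed partition id...
main = df.withColumn("pid", F.lit(0).cast("long"))

# ...and tag a 10% sample with ids spread over num_workers * 4 buckets,
# mirroring the monotonically_increasing_id() % (gpu_number * 4) pattern.
delta = df.sample(fraction=0.1, seed=0).withColumn(
    "pid", F.monotonically_increasing_id() % (num_workers * 4)
)

# Repartitioning the union on pid distributes rows across all workers.
spread = main.union(delta).repartition(num_workers, "pid")
print(spread.rdd.getNumPartitions())  # prints 2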