@@ -233,10 +233,12 @@ def get_column_data_type(data: Union[np.ndarray, list], full_data: pd.DataFrame,
         if all(isinstance(x, str) for x in data):
             can_be_tags = True

+        mean_lenghts = np.mean(lengths) if len(lengths) > 0 else 0
+
         # If more than 30% of the samples contain more than 1 category and there's more than 6 and less than 30 of them and they are shared between the various cells  # noqa
-        if (can_be_tags and np.mean(lengths) > 1.3 and
+        if (can_be_tags and mean_lenghts > 1.3 and
                 6 <= len(unique_tokens) <= 30 and
-                len(unique_tokens) / np.mean(lengths) < (len(data) / 4)):
+                len(unique_tokens) / mean_lenghts < (len(data) / 4)):
             curr_dtype = dtype.tags

     # Categorical based on unique values
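Why the guard matters: `np.mean([])` returns `nan` and emits a RuntimeWarning, so when no row could be parsed into tags the old condition compared against `nan` (and divided by it) while emitting warnings; computing the mean once up front also avoids evaluating it twice inside the condition. A minimal standalone sketch of the same pattern, with illustrative values rather than the module's own variables:

```python
# Sketch of the guarded-mean pattern used in the hunk above; values are illustrative.
import numpy as np

lengths = []           # e.g. no row could be split into comma-separated tags
unique_tokens = set()  # no tokens were collected either

# Without the guard, np.mean([]) warns ("Mean of empty slice") and returns nan.
mean_lengths = np.mean(lengths) if len(lengths) > 0 else 0

# `and` short-circuits on `mean_lengths > 1.3`, so the 0 fallback never
# reaches the division in the last clause.
is_tags = (mean_lengths > 1.3 and
           6 <= len(unique_tokens) <= 30 and
           len(unique_tokens) / mean_lengths < 25)
print(mean_lengths, is_tags)  # 0 False
```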
@@ -392,9 +394,10 @@ def infer_types(
             f'from a total population of {population_size}, this is equivalent to {round(sample_size * 100 / population_size, 1)}% of your data.')  # noqa

     nr_procs = get_nr_procs(df=sample_df)
-    if data.size > mp_cutoff and nr_procs > 1:
-        log.info(f'Using {nr_procs} processes to deduct types.')
-        pool = mp.Pool(processes=nr_procs)
+    pool_size = min(nr_procs, len(sample_df.columns.values))
+    if data.size > mp_cutoff and pool_size > 1:
+        log.info(f'Using {pool_size} processes to deduct types.')
+        pool = mp.Pool(processes=pool_size)
         # column-wise parallelization  # TODO: evaluate switching to row-wise split instead
         answer_arr = pool.starmap(get_column_data_type, [
             (sample_df[x].dropna(), data[x], x, pct_invalid) for x in sample_df.columns.values
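The cap on `pool_size` reflects the column-wise split: each `starmap` task is one column's tuple, so starting more workers than there are columns only adds idle processes and spawn overhead. A self-contained sketch of the same capping logic, with a hypothetical stand-in for the per-column worker:

```python
# Sketch of capping the pool size at the number of columns, since the work is
# split column-wise. count_values is a hypothetical stand-in for the real
# per-column worker (get_column_data_type in the diff above).
import multiprocessing as mp
import pandas as pd


def count_values(values):
    """Toy per-column worker: just counts the values it receives."""
    return len(values)


if __name__ == '__main__':
    df = pd.DataFrame({'a': [1, 2, 3], 'b': ['x', 'y', 'z']})
    suggested_procs = mp.cpu_count()                    # may be 16+ on a big machine
    pool_size = min(suggested_procs, len(df.columns))   # capped at 2 here

    if pool_size > 1:
        with mp.Pool(processes=pool_size) as pool:
            # one task per column, so extra workers would sit idle
            results = pool.map(count_values, [df[c].tolist() for c in df.columns])
    else:
        results = [count_values(df[c].tolist()) for c in df.columns]
    print(results)  # [3, 3]
```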
@@ -422,8 +425,8 @@ def infer_types(
         'dtype_dist': data_dtype_dist
     }

-    if data.size > mp_cutoff and nr_procs > 1:
-        pool = mp.Pool(processes=nr_procs)
+    if data.size > mp_cutoff and pool_size > 1:
+        pool = mp.Pool(processes=pool_size)
         answer_arr = pool.map(get_identifier_description_mp, [
             (data[x], x, type_information.dtypes[x])
             for x in sample_df.columns
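Note the dispatch difference between the two passes: `starmap` unpacks each `(sample, data, name, pct_invalid)` tuple into separate arguments of `get_column_data_type`, while `map` hands `get_identifier_description_mp` the whole `(data, name, dtype)` tuple as a single argument to unpack itself. A tiny illustration with hypothetical workers, not the library's functions:

```python
# map passes each item as one argument; starmap unpacks each tuple into args.
import multiprocessing as mp


def takes_one_tuple(args):
    """map-style worker: unpacks the tuple itself."""
    values, name, dtype_name = args
    return f'{name}: {dtype_name} ({len(values)} values)'


def takes_three_args(values, name, dtype_name):
    """starmap-style worker: receives already-unpacked arguments."""
    return f'{name}: {dtype_name} ({len(values)} values)'


if __name__ == '__main__':
    tasks = [([1, 2, 3], 'a', 'integer'), (['x', 'y'], 'b', 'categorical')]
    with mp.Pool(processes=2) as pool:
        print(pool.map(takes_one_tuple, tasks))        # each task passed as one tuple
        print(pool.starmap(takes_three_args, tasks))   # each task unpacked into args
```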