@@ -599,6 +599,7 @@ def cluster_single_parameters(df, config, modality):
599599 # For example, if there are four runs with five elements and 10 runs with three
600600 # elements, we should cluster the five-element runs separately from the
601601 # three-element runs, and account for that in the clustering labels.
602+ print (f"Applying array clustering to { column_name } " )
602603 lengths = ["x" .join (str (i ) for i in np .array (x ).shape ) for x in column_data ]
603604 unique_lengths = np .unique (lengths )
604605 cluster_idx = 0
@@ -624,6 +625,7 @@ def cluster_single_parameters(df, config, modality):
624625 df .loc [sel_rows , f"Cluster_{ column_name } " ] = cluster_idx
625626 cluster_idx += 1
626627 else :
628+ print (f"Applying non-array clustering to { column_name } " )
627629 array = df [column_name ].to_numpy ().reshape (- 1 , 1 )
628630
629631 # Handle NaNs correctly: Ignore NaNs instead of replacing with -999
@@ -656,6 +658,7 @@ def cluster_single_parameters(df, config, modality):
656658 df [f"Cluster_{ column_name } " ] = cluster_labels
657659
658660 else :
661+ print (f"Not clustering { column_name } " )
659662 # We can rely on string matching (done separately) for string-type fields,
660663 # but arrays of strings need to be handled differently.
661664 column_data = df [column_name ].tolist ()
0 commit comments