Add parallel_clustering option to the analysis base class

yusuferentunc · yusuferentunc · commit b6071b40b89b · 2025-11-14T13:53:33.000+01:00
diff --git a/domhmm/analysis/base.py b/domhmm/analysis/base.py
@@ -76,6 +76,8 @@ class LeafletAnalysisBase(AnalysisBase):
         User-specific HMM (e.g., pre-trained on another simulation)
     do_clustering: bool
         Perform the hierarchical clustering for each frame
+    parallel_clustering: bool
+        Run the hierarchical clustering in parallel across all available CPU cores
     n_init_hmm: int
         Number of repeats for HMM model trainings
 
@@ -128,6 +130,7 @@ def __init__(
             n_init_hmm: int = 2,
             save_plots: bool = False,
             do_clustering: bool = True,
+            parallel_clustering: bool = False,
             **kwargs
     ):
         # the below line must be kept to initialize the AnalysisBase class!
@@ -158,6 +161,7 @@ def __init__(
         self.n_init_hmm = n_init_hmm
         self.save_plots = save_plots
         self.do_clustering = do_clustering
+        self.parallel_clustering = parallel_clustering
 
         assert heads.keys() == tails.keys(), "Heads and tails don't contain same residue names"
 
diff --git a/domhmm/analysis/domhmm.py b/domhmm/analysis/domhmm.py
@@ -404,7 +404,10 @@ def _conclude(self):
             pass
         else:
             log.info("Clustering is starting.")
-            self.result_clustering()
+            if self.parallel_clustering:
+                self.result_clustering_parallel()
+            else:
+                self.result_clustering_serial()
 
             if self.result_plots:
                 self.clustering_plot()
@@ -1450,7 +1453,7 @@ def _process_frame_leaflet(args):
 
         return (j, frame_number, cluster_result)
 
-    def result_clustering(self):
+    def result_clustering_parallel(self):
         """
         Runs hierarchical clustering for each frame and saves result (parallelized).
         """
@@ -1472,7 +1475,7 @@ def result_clustering(self):
                 }
                 tasks.append((i, j, frame_data))
 
-        print(f"Total CPU count is {mp.cpu_count()}")
+        log.info(f"{mp.cpu_count()} CPU cores will be used for hierarchical clustering")
         with mp.Pool(processes=mp.cpu_count()) as pool:
             results = list(tqdm(pool.imap(self._process_frame_leaflet, tasks), total=len(tasks)))