fixes

Marcello-Sega · Marcello-Sega · commit bdf5070b1c86 · 2026-04-27T08:41:28.000+01:00
diff --git a/docs/source/newproperties.rst b/docs/source/newproperties.rst
@@ -96,6 +96,15 @@ This results in the following (isolated molecules not shown)
 +----------------------------+
 
 
+Surface clusters
+----------------
+
+The value of :py:obj:`atoms.surface_clusters` identifies the surface cluster of each
+atom for classes like :class:`~pytim.gitim.GITIM`, provided that the option
+:obj:`surface_cluster_cut` is not ``None``. The label is -1 if the atom is not a surface
+one; otherwise it is a progressive number, starting from 0 for the largest surface cluster.
+
+
 Sides
 -----
 
diff --git a/pyproject.toml b/pyproject.toml
@@ -39,7 +39,7 @@ dependencies = [
   "cython>=3.0.8",
   "gsd>3.0.0",
   "MDAnalysis>=2.8.0",
-  "scipy>=1.12.0",
+  "scipy>=1.15.0",
   "setuptools",
   "PyWavelets>=1.8.0",
   "scikit-image>=0.24.0",
diff --git a/pytim/gitim.py b/pytim/gitim.py
@@ -54,6 +54,10 @@ class GITIM(Interface):
                                     search, if cluster_cut is not None
         :param Object extra_cluster_groups: Additional groups, to allow for
                                     mixed interfaces
+        :param int extra_cluster_count: Number of clusters (sorted by decreasing
+                                    size) to be considered the majority component of
+                                    the opposite phase.
+                                    Default: 1.
         :param int n_clusters:      Tag as surface atoms/molecules only
                                     those in the n_clusters largest clusters.
                                     Default: None, uses all clusters.
@@ -75,7 +79,6 @@ class GITIM(Interface):
                                     cluster of (initially detected) surface
                                     ones.
                                     Default: None, disables the filtering.
-        :param str symmetry:        Gives the code a hint about the topology
         :param str symmetry:        Gives the code a hint about the topology
                                     of the interface: 'generic' (default)
                                     or  'planar'
@@ -143,6 +146,7 @@ def __init__(self,
                  include_zero_radius=False,
                  cluster_threshold_density=None,
                  extra_cluster_groups=None,
+                 extra_cluster_count=1,
                  n_clusters=None,
                  min_cluster_size=None,
                  biggest_cluster_only = False, # backward compatibility, sets n_clusters
@@ -290,10 +294,9 @@ def _assign_layers_setup(self):
         # from the triangulation and updating the circumradius of the neighbors
         # of the removed points  only.
 
-        dbs = utilities.do_cluster_analysis_dbscan
-        return alpha_group, dbs
+        return alpha_group
 
-    def _assign_layers_postprocess(self, dbs, group, alpha_group, layer):
+    def _assign_layers_postprocess(self, group, alpha_group, layer):
         if len(group) > 0:
             if self.molecular:
                 group = group.residues.atoms
@@ -303,12 +306,20 @@ def _assign_layers_postprocess(self, dbs, group, alpha_group, layer):
         alpha_group = alpha_group[:] - group[:]
         self.label_group(
             self._layers[layer], beta=1. * (layer + 1), layer=(layer + 1))
+
+        if self.surface_cluster_cut is not None:
+            self.label_group(group.universe.atoms,surface_cluster=-1)
+            sorted_groups = self._generate_surface_clusters(
+                    group, self.surface_cluster_cut)
+            for i,g in enumerate(sorted_groups):
+                self.label_group(g,surface_cluster=i)
+
         return alpha_group
 
     def _assign_layers(self):
         """Determine the GITIM layers."""
 
-        alpha_group, dbs = self._assign_layers_setup()
+        alpha_group = self._assign_layers_setup()
 
         self.triangulation = []  # storage for triangulations
 
@@ -317,12 +328,9 @@ def _assign_layers(self):
             alpha_ids = self.alpha_shape(self.alpha, alpha_group, layer)
 
             group = alpha_group[alpha_ids]
-            if self.surface_cluster_cut is not None:
-                group = self._generate_surface_clusters(
-                    group, self.surface_cluster_cut)
 
             alpha_group = self._assign_layers_postprocess(
-                dbs, group, alpha_group, layer)
+                group, alpha_group, layer)
 
         # reset the interpolator
         self._interpolator = None
diff --git a/pytim/interface.py b/pytim/interface.py
@@ -68,6 +68,11 @@ class Interface(object):
     extra_cluster_groups, _extra_cluster_groups =\
         _create_property('extra_cluster_groups',
                          "(ndarray) additional cluster groups")
+
+    extra_cluster_count, _extra_cluster_count=\
+        _create_property('extra_cluster_count',
+                         "(int) maximum number of extra group clusters (sorted by decreasing size) to exclude.")
+
     radii_dict, _radii_dict =\
         _create_property('radii_dict', "(dict) custom atomic radii")
 
@@ -126,6 +131,7 @@ def label_group(self,
                     beta=None,
                     layer=None,
                     cluster=None,
+                    surface_cluster=None,
                     side=None):
         if group is None:
             raise RuntimeError(
@@ -146,6 +152,8 @@ def label_group(self,
             _group.sides = side
         if cluster is not None:
             _group.clusters = cluster
+        if surface_cluster is not None:
+            _group.surface_clusters = surface_cluster
 
     def _assign_symmetry(self, symmetry):
         if self.analysis_group is None:
@@ -158,72 +166,71 @@ def _assign_symmetry(self, symmetry):
             self.symmetry = symmetry
 
     def _generate_surface_clusters(self, group, cut):
-        # at the moment, selects only the biggest cluster
-        labels, counts, neighs = utilities.do_cluster_analysis_dbscan(
-                group, cut,
+        labels, counts, neighs, _ = utilities.do_cluster_analysis_dbscan(
+                group=group, cluster_cut=cut,threshold_density=None,
                 molecular=False)
-        return group[np.where(labels == np.argmax(counts))[0]]
+        sortid = np.argsort(counts,stable=True)[::-1]
+        sortid = sortid[counts[sortid]>0] # keep only the non-empty clusters (at least one element)
+        self.surface_clusters =  [group[labels==s] for s in sortid]
+        return self.surface_clusters
 
     def _define_cluster_group(self):
         self.universe.atoms.pack_into_box()
         self.cluster_group = self.universe.atoms[:0]  # empty
         if (self.cluster_cut is not None):
-            cluster_cut = float(self.cluster_cut[0])
             # we start by adding the atoms in the smaller clusters
-            # of the opposit phase, if extra_cluster_groups are provided
+            # of the opposite phase, if extra_cluster_groups are provided
+            self._min_samples=[None]
             if (self.extra_cluster_groups is not None):
-                for extra in self.extra_cluster_groups:
-                    x_labels, x_counts, _ = utilities.do_cluster_analysis_dbscan(
-                        extra, cluster_cut, self.cluster_threshold_density,
-                        self.molecular)
+                for i,extra in enumerate(self.extra_cluster_groups):
+                    if len(self.cluster_cut) == 1:
+                        cluster_cut = self.cluster_cut[0]
+                    else: 
+                        cluster_cut = self.cluster_cut[i+1]
+                    if len(self.cluster_threshold_density) == 1:
+                        cluster_threshold_density = self.cluster_threshold_density[0]
+                    else: 
+                        cluster_threshold_density = self.cluster_threshold_density[i+1]
+                    x_labels, x_counts, _ , min_samples = utilities.do_cluster_analysis_dbscan(
+                        group=extra, cluster_cut=cluster_cut,
+                        threshold_density=cluster_threshold_density,
+                        molecular=self.molecular)
                     x_labels = np.array(x_labels)
-                    x_label_max = np.argmax(x_counts)
-                    x_ids_other = np.where(x_labels != x_label_max)[0]
-
+                    x_label_selection = np.argsort(x_counts)[::-1][:self.extra_cluster_count]
+                    x_ids_other = np.where(~np.isin(x_labels, x_label_selection))[0]
+                    self._min_samples.append(float(min_samples))
                     self.cluster_group += extra[x_ids_other]
-
+                    self.minority_cluster_group = extra[x_ids_other]
             # next, we add the atoms belonging to the main phase
             self.cluster_group += self.analysis_group
 
             # groups have been checked already in _sanity_checks()
             # self.cluster_group at this stage is composed of analysis_group +
             # the smaller clusters of the other phase
-            labels, counts, neighbors = utilities.do_cluster_analysis_dbscan(
-                self.cluster_group, cluster_cut,
-                self.cluster_threshold_density, self.molecular)
+            labels, counts, neighbors, min_samples = utilities.do_cluster_analysis_dbscan(
+                self.cluster_group, self.cluster_cut[0],
+                self.cluster_threshold_density[0], self.molecular)
             labels = np.array(labels)
-
+            self._min_samples[0] = float(min_samples)
             # counts is not necessarily ordered by size of cluster.
+            # we sort it and remember that its index corresponds to the 
+            # label
             sorting = np.argsort(counts,kind='stable')[::-1]
             # labels for atoms in each cluster starting from the largest
-            unique_labels = np.sort(np.unique(labels[labels > -1]))
+            # discarding cases where counts are zero (exhausted the labels)
+            unique_labels = [int(lab) for lab in sorting if counts[lab] > 0]
             # by default, all elements of the cluster_group are in
             # single-molecule/atom clusters. We will update them right after.
             self.label_group(self.cluster_group, cluster=-1)
-            # we go in reverse order to let smaller labels (bigger clusters)
-            # overwrite larger labels (smaller cluster) when the molecular
+            # we let bigger clusters overwrite smaller ones when the molecular
             # option is used.
             for el in unique_labels[::-1]:
                 # select a label
-                cond = np.where(labels == el)
+                cond = (labels == el)
                 if self.molecular is True:
                     g_ = self.cluster_group[cond].residues.atoms
                 else:
                     g_ = self.cluster_group[cond]
-                # probably we need an example here, say:
-                # counts = [ 61, 1230, 34, 0, ...  0 ,0 ]
-                # labels = [ 0, 1, 2, 1, -1  ....  -1 ]
-                # we have three clusters, of 61, 1230 and 34 atoms.
-                # There are 61 labels '0'
-                #         1230 labels '1'
-                #           34 labels '2'
-                #         the remaining are '-1'
-                #
-                # sorting = [1,0,2,3,....] i.e. the largest element is in
-                #     (1230) position 1, the next (61) is in position 0, ...
-                # Say, g_ is now the group with label '1' (the biggest cluster)
-                # Using argwhere(sorting==1) returns exactly 0 -> the right
-                # ordered label for the largest cluster.
                 self.label_group(g_, cluster=np.argwhere(sorting == el)[0, 0])
             # now that labels are assigned for each of the clusters,
             # we can restric the cluster group to the largest cluster.
@@ -253,6 +260,8 @@ def _define_cluster_group(self):
         else:
             self.cluster_group = self.analysis_group
             self.label_group(self.cluster_group, cluster=0)
+        if len(self.cluster_group) == 0:
+            raise ValueError('Empty cluster group: change your cluster search settings.')
 
     def is_buried(self, pos):
         """ Checks wether an array of positions are located below
diff --git a/pytim/properties.py b/pytim/properties.py
@@ -23,6 +23,13 @@ class Clusters(MDAnalysis.core.topologyattrs.AtomAttr):
     per_object = 'atom'
 
 
+class SurfaceClusters(MDAnalysis.core.topologyattrs.AtomAttr):
+    """Clusters for each surface atom"""
+    attrname = 'surface_clusters'
+    singular = 'surface_cluster'
+    per_object = 'atom'
+
+
 class Sides(MDAnalysis.core.topologyattrs.AtomAttr):
     """Sides for each atom"""
     attrname = 'sides'
@@ -82,7 +89,7 @@ def _missing_attributes(interface, universe):
 def _extra_attributes(interface, universe):
     # we add here the new layer, cluster and side information
     # they are not part of MDAnalysis.core.topologyattrs
-    attr = {'layers': Layers, 'clusters': Clusters, 'sides': Sides}
+    attr = {'layers': Layers, 'clusters': Clusters, 'sides': Sides, 'surface_clusters': SurfaceClusters}
     for key in attr.keys():
         if key not in dir(universe.atoms):
             vals = np.zeros(len(universe.atoms), dtype=int) - 1
diff --git a/pytim/sanity_check.py b/pytim/sanity_check.py
@@ -111,14 +111,19 @@ def assign_cluster_params(self,
         elements = 0
         extraelements = -1
 
-        self.interface.cluster_threshold_density = cluster_threshold_density
 
         # we first make sure cluster_cut is either None, or an array
         if isinstance(cluster_cut, (int, float)):
             self.interface.cluster_cut = np.array([float(cluster_cut)])
         else:
             self.interface.cluster_cut = cluster_cut
 
+        # same with the cluster threshold
+        if isinstance(cluster_threshold_density, (int, float, str, type(None))):
+            self.interface.cluster_threshold_density = np.array([cluster_threshold_density])
+        else:
+            self.interface.cluster_threshold_density = cluster_threshold_density
+
         # same with extra_cluster_groups
         if not isinstance(extra_cluster_groups,
                           (list, tuple, np.ndarray, type(None))):
diff --git a/pytim/utilities_dbscan.py b/pytim/utilities_dbscan.py
@@ -19,7 +19,8 @@ def determine_samples(threshold_density, cluster_cut, n_neighbors):
     elif (threshold_density == 'auto'):
         modes = 2
         centroid, _ = vq.kmeans2(
-            n_neighbors * 1.0, modes, iter=10, check_finite=False)
+            n_neighbors * 1.0, modes, iter=10, check_finite=False,
+            rng=5317) # rng used to set the seed for reproducible results
         min_samples = np.max(centroid)
 
     else:
@@ -35,11 +36,13 @@ def do_cluster_analysis_dbscan(group,
                                molecular=True):
     """ Performs a cluster analysis using DBSCAN
 
-        :returns [labels,counts,neighbors]: lists of the id of the cluster to
-                                  which every atom is belonging to, of the
-                                  number of elements in each cluster, and of
-                                  the number of neighbors for each atom
-                                  according to the specified criterion.
+        :returns [labels,counts,neighbors,min_samples]: lists of the id of
+                 the cluster to which every atom belongs, of the
+                 number of elements in each cluster, and of the number of
+                 neighbors for each atom according to the specified criterion.
+                 The last item, min_samples, is the threshold used in the
+                 clustering algorithm as passed or determined with the 'auto'
+                 option.
 
         Uses a slightly modified version of DBSCAN from sklearn.cluster
         that takes periodic boundary conditions into account (through
@@ -71,13 +74,12 @@ def do_cluster_analysis_dbscan(group,
 
     min_samples = determine_samples(threshold_density, cluster_cut,
                                     n_neighbors)
-
     labels = -np.ones(points.shape[0], dtype=np.intp)
     counts = np.zeros(points.shape[0], dtype=np.intp)
 
     core_samples = np.asarray(n_neighbors >= min_samples, dtype=np.uint8)
     dbscan_inner(core_samples, neighborhoods, labels, counts)
-    return labels, counts, n_neighbors
+    return labels, counts, n_neighbors, min_samples
 
 
 def _():
@@ -94,10 +96,10 @@ def _():
     >>> u = mda.Universe(ILBENZENE_GRO)
     >>> benzene = u.select_atoms('name C and resname LIG')
     >>> u.atoms.positions = u.atoms.pack_into_box()
-    >>> l,c,n =  DBScan(benzene, cluster_cut = 4.5, threshold_density = None)
-    >>> l1,c1,n1 = DBScan(benzene, cluster_cut = 8.5, threshold_density = 'auto')
+    >>> l,c,n,t =  DBScan(benzene, cluster_cut = 4.5, threshold_density = None)
+    >>> l1,c1,n1,t1 = DBScan(benzene, cluster_cut = 8.5, threshold_density = 'auto')
     >>> td = 0.009
-    >>> l2,c2,n2 = DBScan(benzene, cluster_cut = 8.5, threshold_density = td)
+    >>> l2,c2,n2,t2 = DBScan(benzene, cluster_cut = 8.5, threshold_density = td)
     >>> print (np.sort(c)[-2:])
     [   12 14904]
 
diff --git a/requirements.txt b/requirements.txt
@@ -1,6 +1,6 @@
 cython>=3.0.8
 numpy>=2.1.3
-scipy>=1.12.0
+scipy>=1.15.0
 gsd>3.0.0
 setuptools
 MDAnalysis>=2.8.0