bacpop
diff --git a/.github/workflows/azure_ci.yml b/.github/workflows/azure_ci.yml
Lines changed: 2 additions & 2 deletions
diff --git a/PopPUNK/__init__.py b/PopPUNK/__init__.py
Lines changed: 5 additions & 1 deletion
diff --git a/PopPUNK/__main__.py b/PopPUNK/__main__.py
Lines changed: 14 additions & 7 deletions
diff --git a/PopPUNK/assign.py b/PopPUNK/assign.py
Lines changed: 5 additions & 5 deletions
diff --git a/PopPUNK/lineages.py b/PopPUNK/lineages.py
Lines changed: 43 additions & 25 deletions
diff --git a/PopPUNK/models.py b/PopPUNK/models.py
Lines changed: 14 additions & 14 deletions
@@ -30,8 +30,8 @@ jobs:
         micromamba-version: '1.4.6-0'
         environment-file: environment.yml
         # persist on the same day.
-        cache-environment-key: environment-${{ steps.date.outputs.date }}
-        cache-downloads-key: downloads-${{ steps.date.outputs.date }}
+#        cache-environment-key: environment-${{ steps.date.outputs.date }}
+#        cache-downloads-key: downloads-${{ steps.date.outputs.date }}
     - name: Install and run_test.py
       shell: bash -l {0}
       run: |
 
@@ -3,9 +3,13 @@
 
 '''PopPUNK (POPulation Partitioning Using Nucleotide Kmers)'''
 
-__version__ = '2.7.7'
+__version__ = '2.7.8'
 
 # Minimum sketchlib version
 SKETCHLIB_MAJOR = 2
 SKETCHLIB_MINOR = 0
 SKETCHLIB_PATCH = 1
+
+# Lineage defaults: search depth multiplier and minimum resolution
+SEARCH_DEPTH_FACTOR = 10
+DEFAULT_LINEAGE_RESOLUTION = 1e-10
@@ -11,6 +11,7 @@
 
 # import poppunk package
 from .__init__ import __version__
+from .__init__ import SEARCH_DEPTH_FACTOR, DEFAULT_LINEAGE_RESOLUTION
 
 # globals
 accepted_weights_types = ["core", "accessory", "euclidean"]
@@ -190,7 +191,7 @@ def get_options():
                                 help='Number of kNN distances per sequence to filter when '
                                       'counting neighbours or using only reciprocal matches',
                                 type = int,
-                                default = None)
+                                default = 10000)
     lineagesGroup.add_argument('--write-lineage-networks',
                                 help='Save all lineage networks',
                                 action = 'store_true',
@@ -199,6 +200,10 @@ def get_options():
                                 help='Use accessory distances for lineage definitions [default = use core distances]',
                                 action = 'store_true',
                                 default = False)
+    lineagesGroup.add_argument('--lineage-resolution',
+                                help='Minimum genetic separation between isolates required to initiate a new lineage',
+                                type = float,
+                                default = DEFAULT_LINEAGE_RESOLUTION)
 
     other = parser.add_argument_group('Other options')
     other.add_argument('--threads', default=1, type=int, help='Number of threads to use [default = 1]')
@@ -273,7 +278,6 @@ def main():
     from .utils import setupDBFuncs
     from .utils import readPickle, storePickle
     from .utils import createOverallLineage
-    from .utils import get_match_search_depth
     from .utils import check_and_set_gpu
 
     # check kmer properties
@@ -568,21 +572,24 @@ def main():
                 # Memory usage determined by maximum search depth
                 if args.max_search_depth is not None:
                     max_search_depth = int(args.max_search_depth)
-                elif args.max_search_depth is None and (args.reciprocal_only or args.count_unique_distances):
-                    max_search_depth = get_match_search_depth(refList,rank_list)
                 else:
-                    max_search_depth = max(rank_list)
+                    # By default retain more search distances than the
+                    # maximum requested rank: when only unique distances
+                    # are counted, and distances differing by less than
+                    # epsilon are merged, more than the max rank number
+                    # of values is required
+                    max_search_depth = max(rank_list)*SEARCH_DEPTH_FACTOR
 
                 model = LineageFit(output,
                                     rank_list,
                                     max_search_depth,
                                     args.reciprocal_only,
                                     args.count_unique_distances,
+                                    args.lineage_resolution,
                                     1 if args.use_accessory else 0,
                                     use_gpu = args.gpu_graph)
                 model.set_threads(args.threads)
-                model.fit(distMat,
-                            args.use_accessory)
+                model.fit(distMat)
 
                 assignments = {}
                 for rank in rank_list:
 
@@ -128,10 +128,11 @@ def get_options():
     # combine
     args = parser.parse_args()
 
-    # ensure directories do not have trailing forward slash
-    for arg in [args.db, args.model_dir, args.output, args.previous_clustering]:
-        if arg is not None:
-            arg = arg.rstrip('\\')
+    # ensure directories do not have trailing slash
+    for attr_name in ['db', 'model_dir', 'output', 'previous_clustering']:
+        attr_value = getattr(args, attr_name)
+        if attr_value is not None:
+            setattr(args, attr_name, attr_value.rstrip('\\').rstrip('/'))
 
     return args
 
@@ -275,7 +276,6 @@ def assign_query(dbFuncs,
     createDatabaseDir = dbFuncs['createDatabaseDir']
     constructDatabase = dbFuncs['constructDatabase']
     readDBParams = dbFuncs['readDBParams']
-
     if ref_db == output and overwrite == False:
         sys.stderr.write("--output and --db must be different to "
                          "prevent overwrite.\n")
 
@@ -11,13 +11,17 @@
 import pandas as pd
 from collections import defaultdict
 
+from .__init__ import SEARCH_DEPTH_FACTOR, DEFAULT_LINEAGE_RESOLUTION
+
 from .assign import assign_query_hdf5
 from .network import construct_network_from_edge_list, printClusters, save_network
 from .models import LineageFit
 from .plot import writeClusterCsv
 from .sketchlib import readDBParams
 from .qc import prune_distance_matrix, sketchlibAssemblyQC
-from .utils import createOverallLineage, get_match_search_depth, readPickle, setupDBFuncs
+from .utils import createOverallLineage, readPickle, setupDBFuncs, update_distance_matrices, storePickle
+
+import pp_sketchlib
 
 # command line parsing
 def get_options():
@@ -114,7 +118,7 @@ def get_options():
                                     help="Number of kNN distances per sequence to filter when "
                                     "counting neighbours or using only reciprocal matches",
                                     type = int,
-                                    default = None)
+                                    default = 10000)
     lGroup.add_argument('--use-accessory',
                                     help="Use accessory distances for lineage clustering",
                                     action = 'store_true',
@@ -130,6 +134,10 @@ def get_options():
                                     help="Only use reciprocal kNN matches for lineage definitions",
                                     action = 'store_true',
                                     default = False)
+    lGroup.add_argument('--lineage-resolution',
+                                help="Minimum genetic separation between isolates required to initiate a new lineage",
+                                type = float,
+                                default = DEFAULT_LINEAGE_RESOLUTION)
 
     return parser.parse_args()
 
@@ -165,15 +173,13 @@ def create_db(args):
     else:
         clustering_file = args.external_clustering
     strains = pd.read_csv(clustering_file, dtype = str).groupby(args.clustering_col_name)
-
+    
     sys.stderr.write("Extracting properties of database\n")
     # Get rlist
     if args.distances is None:
         distances = os.path.join(args.create_db,os.path.basename(args.create_db) + ".dists")
     else:
         distances = args.distances
-    # Get distances
-    rlist, qlist, self, X = readPickle(distances, enforce_self=False, distances=True)
     # Get parameters
     kmers, sketch_sizes, codon_phased = readDBParams(args.create_db)
     # Ranks to use
@@ -185,9 +191,15 @@ def create_db(args):
         else:
             max_search_depth = args.max_search_depth
     else:
-        max_search_depth = get_match_search_depth(rlist,rank_list)
+        # By default retain more search distances than the
+        # maximum requested rank: when only unique distances
+        # are counted, and distances differing by less than
+        # epsilon are merged, more than the max rank number
+        # of values is required
+        max_search_depth = max(rank_list)*SEARCH_DEPTH_FACTOR
 
     sys.stderr.write("Generating databases for individual strains\n")
+    all_isolates = list()
     # Dicts for storing typing information
     lineage_dbs = {}
     overall_lineage = {}
@@ -199,6 +211,7 @@ def create_db(args):
         num_isolates = len(isolate_list)
         if num_isolates >= args.min_count:
             lineage_dbs[strain] = strain_db_name
+            all_isolates.extend(isolate_list)
             if os.path.isdir(strain_db_name) and args.overwrite:
                 sys.stderr.write("--overwrite means {strain_db_name} will be deleted now\n")
                 shutil.rmtree(strain_db_name)
@@ -217,27 +230,32 @@ def create_db(args):
                 shutil.rmtree(dest_db)
             elif not os.path.exists(dest_db):
                 os.symlink(rel_path,dest_db)
-            # Extract sparse distances
-            prune_distance_matrix(rlist,
-                            list(set(rlist) - set(isolate_list)),
-                            X,
-                            os.path.join(strain_db_name,strain_db_name + '.dists'))
+            # Store isolate names
+            storePickle(isolate_list, isolate_list, True, None, os.path.join(strain_db_name,strain_db_name + '.dists'))
+            # Calculate within-strain distances
+            strain_distMat = pp_sketchlib.queryDatabase(ref_db_name=dest_db.replace('.h5',''),
+                                                        query_db_name=dest_db.replace('.h5',''),
+                                                        rList=isolate_list,
+                                                        qList=isolate_list,
+                                                        klist=kmers.tolist(),
+                                                        random_correct=True,
+                                                        jaccard=False,
+                                                        num_threads=args.threads,
+                                                        use_gpu = args.gpu_dist,
+                                                        device_id = args.deviceid)
+
             # Initialise model
             model = LineageFit(strain_db_name,
                       rank_list,
                       max_search_depth,
                       args.reciprocal_only,
                       args.count_unique_distances,
+                      args.lineage_resolution,
+                      dist_col = 1 if args.use_accessory else 0,
                       use_gpu = args.gpu_graph)
             model.set_threads(args.threads)
-            # Load pruned distance matrix
-            strain_rlist, strain_qlist, strain_self, strain_X = \
-                                readPickle(os.path.join(strain_db_name,strain_db_name + '.dists'),
-                                            enforce_self=False,
-                                            distances=True)
             # Fit model
-            model.fit(strain_X,
-                        args.use_accessory)
+            model.fit(strain_distMat)
             # Lineage fit requires some iteration
             indivNetworks = {}
             lineage_clusters = defaultdict(dict)
@@ -246,8 +264,8 @@ def create_db(args):
                 if rank <= num_isolates:
                     assignments = model.assign(rank)
                 # Generate networks
-                indivNetworks[rank] = construct_network_from_edge_list(strain_rlist,
-                                                            strain_rlist,
+                indivNetworks[rank] = construct_network_from_edge_list(isolate_list,
+                                                            isolate_list,
                                                             assignments,
                                                             weights = None,
                                                             betweenness_sample = None,
@@ -262,7 +280,7 @@ def create_db(args):
                 # Identify clusters from output
                 lineage_clusters[rank] = \
                     printClusters(indivNetworks[rank],
-                                  strain_rlist,
+                                  isolate_list,
                                   printCSV = False,
                                   use_gpu = args.gpu_graph)[0]
                 n_clusters = max(lineage_clusters[rank].values())
@@ -271,8 +289,8 @@ def create_db(args):
             # For each strain, print output of each rank as CSV
             overall_lineage[strain] = createOverallLineage(rank_list, lineage_clusters)
             writeClusterCsv(os.path.join(strain_db_name,os.path.basename(strain_db_name) + '_lineages.csv'),
-                strain_rlist,
-                strain_rlist,
+                isolate_list,
+                isolate_list,
                 overall_lineage[strain],
                 output_format = 'phandango',
                 epiCsv = None,
@@ -282,12 +300,12 @@ def create_db(args):
             model.save()
 
     # Print combined strain and lineage clustering
-    print_overall_clustering(overall_lineage,args.output + '.csv',rlist)
+    print_overall_clustering(overall_lineage,args.output + '.csv',all_isolates)
 
     # Write scheme to file
     with open(args.db_scheme, 'wb') as pickle_file:
         pickle.dump([args.create_db,
-                      rlist,
+                      isolate_list,
                       args.model_dir,
                       clustering_file,
                       args.clustering_col_name,
 
@@ -1120,11 +1120,13 @@ class LineageFit(ClusterFit):
             The ranks used in the fit
     '''
 
-    def __init__(self, outPrefix, ranks, max_search_depth, reciprocal_only, count_unique_distances, dist_col = None, use_gpu = False):
+    def __init__(self, outPrefix, ranks, max_search_depth, reciprocal_only,
+                  count_unique_distances, lineage_resolution, dist_col = None, use_gpu = False):
         ClusterFit.__init__(self, outPrefix)
         self.type = 'lineage'
         self.preprocess = False
-        self.max_search_depth = max_search_depth+5 # Set to highest rank by default in main; need to store additional distances
+        max_rank = max(ranks)
+        self.max_search_depth = max(max_search_depth,max_rank+5) # Set to highest rank by default in main; need to store additional distances
                                                    # when there is redundancy (e.g. reciprocal matching, unique distance counting)
                                                    # or other sequences may be pruned out of the database
         self.nn_dists = None # stores the unprocessed kNN at the maximum search depth
@@ -1139,6 +1141,7 @@ def __init__(self, outPrefix, ranks, max_search_depth, reciprocal_only, count_un
         self.reciprocal_only = reciprocal_only
         self.count_unique_distances = count_unique_distances
         self.dist_col = dist_col
+        self.resolution = lineage_resolution
         self.use_gpu = use_gpu
 
     def __save_sparse__(self, data, row, col, rank, n_samples, dtype, is_nn_dist = False):
@@ -1177,6 +1180,7 @@ def __reduce_rank__(self, higher_rank_sparse_mat, lower_rank, n_samples, dtype):
                 lower_rank,
                 self.reciprocal_only,
                 self.count_unique_distances,
+                self.resolution,
                 self.threads)
         self.__save_sparse__(lower_rank_sparse_mat[2],
                              lower_rank_sparse_mat[0],
@@ -1185,7 +1189,7 @@ def __reduce_rank__(self, higher_rank_sparse_mat, lower_rank, n_samples, dtype):
                              n_samples,
                              dtype)
 
-    def fit(self, X, accessory):
+    def fit(self, X):
         '''Extends :func:`~ClusterFit.fit`
 
         Gets assignments by using nearest neigbours.
@@ -1194,8 +1198,6 @@ def fit(self, X, accessory):
             X (numpy.array)
                 The core and accessory distances to cluster. Must be set if
                 preprocess is set.
-            accessory (bool)
-                Use accessory rather than core distances
 
         Returns:
             y (numpy.array)
@@ -1205,23 +1207,20 @@ def fit(self, X, accessory):
         ClusterFit.fit(self, X)
         sample_size = int(round(0.5 * (1 + np.sqrt(1 + 8 * X.shape[0]))))
         if (max(self.ranks) >= sample_size):
-            sys.stderr.write("Rank must be less than the number of samples")
+            sys.stderr.write("Maximum rank must be less than the number of samples: " + str(sample_size) + "\n")
             sys.exit(0)
 
-        if accessory:
-            self.dist_col = 1
-        else:
-            self.dist_col = 0
+        search_depth = min(self.max_search_depth,sample_size-1)
 
         row, col, data = \
             poppunk_refine.get_kNN_distances(
                 distMat=pp_sketchlib.longToSquare(distVec=X[:, [self.dist_col]],
                                                   num_threads=self.threads),
-                kNN=self.max_search_depth,
+                kNN=search_depth,
                 dist_col=self.dist_col,
                 num_threads=self.threads
             )
-        self.__save_sparse__(data, row, col, self.max_search_depth, sample_size, X.dtype,
+        self.__save_sparse__(data, row, col, search_depth, sample_size, X.dtype,
                               is_nn_dist = True)
 
         # Apply filtering of links if requested and extract lower ranks - parallelisation within C++ code
@@ -1258,7 +1257,8 @@ def save(self):
                                 self.max_search_depth,
                                 self.reciprocal_only,
                                 self.count_unique_distances,
-                                self.dist_col],
+                                self.dist_col,
+                                self.resolution],
                                 self.type],
                             pickle_file)
 
@@ -1271,7 +1271,7 @@ def load(self, fit_npz, fit_obj):
             fit_obj (sklearn.mixture.BayesianGaussianMixture)
                 The saved fit object
         '''
-        self.ranks, self.max_search_depth, self.reciprocal_only, self.count_unique_distances, self.dist_col = fit_obj
+        self.ranks, self.max_search_depth, self.reciprocal_only, self.count_unique_distances, self.dist_col, self.resolution = fit_obj
         self.nn_dists = fit_npz
         self.fitted = True