Skip to content

Commit 81d436d

Browse files
authored
Merge pull request #320 from bacpop/sparse_dist_assign
PopPUNK 2.7.0 candidate
2 parents cdf1b2c + 173e65f commit 81d436d

15 files changed

Lines changed: 186 additions & 147 deletions

PopPUNK/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33

44
'''PopPUNK (POPulation Partitioning Using Nucleotide Kmers)'''
55

6-
__version__ = '2.6.7'
6+
__version__ = '2.7.0'
77

88
# Minimum sketchlib version
99
SKETCHLIB_MAJOR = 2

PopPUNK/assign.py

Lines changed: 97 additions & 85 deletions
Large diffs are not rendered by default.

PopPUNK/lineages.py

Lines changed: 14 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77
import argparse
88
import subprocess
99
import pickle
10+
import shutil
1011
import pandas as pd
1112
from collections import defaultdict
1213

@@ -142,16 +143,18 @@ def main():
142143
create_db(args)
143144
elif args.query_db is not None:
144145
query_db(args)
145-
146+
146147

147148
def create_db(args):
148149

149150
# Check if output files exist
150151
if not args.overwrite:
151152
if os.path.exists(args.output + '.csv'):
152153
sys.stderr.write('Output file ' + args.output + '.csv exists; use --overwrite to replace it\n')
154+
sys.exit(1)
153155
if os.path.exists(args.db_scheme):
154156
sys.stderr.write('Output file ' + args.db_scheme + ' exists; use --overwrite to replace it\n')
157+
sys.exit(1)
155158

156159
sys.stderr.write("Identifying strains in existing database\n")
157160
# Read in strain information
@@ -197,7 +200,8 @@ def create_db(args):
197200
if num_isolates >= args.min_count:
198201
lineage_dbs[strain] = strain_db_name
199202
if os.path.isdir(strain_db_name) and args.overwrite:
200-
os.rmdir(strain_db_name)
203+
sys.stderr.write("--overwrite means {strain_db_name} will be deleted now\n")
204+
shutil.rmtree(strain_db_name)
201205
if not os.path.isdir(strain_db_name):
202206
try:
203207
os.makedirs(strain_db_name)
@@ -209,7 +213,8 @@ def create_db(args):
209213
dest_db = os.path.join(strain_db_name,os.path.basename(strain_db_name) + '.h5')
210214
rel_path = os.path.relpath(src_db, os.path.dirname(dest_db))
211215
if os.path.exists(dest_db) and args.overwrite:
212-
os.remove(dest_db)
216+
sys.stderr.write("--overwrite means {dest_db} will be deleted now\n")
217+
shutil.rmtree(dest_db)
213218
elif not os.path.exists(dest_db):
214219
os.symlink(rel_path,dest_db)
215220
# Extract sparse distances
@@ -304,7 +309,7 @@ def create_db(args):
304309

305310

306311
def query_db(args):
307-
312+
308313
# Read querying scheme
309314
with open(args.db_scheme, 'rb') as pickle_file:
310315
ref_db, rlist, model_dir, clustering_file, args.clustering_col_name, distances, \
@@ -374,6 +379,7 @@ def query_db(args):
374379
False, # write references - need to consider whether to support ref-only databases for assignment
375380
distances,
376381
False, # serial - needs to be supported for web version?
382+
None, # stable - not supported here
377383
args.threads,
378384
True, # overwrite - probably OK?
379385
False, # plot_fit - turn off for now
@@ -420,6 +426,7 @@ def query_db(args):
420426
False, # write references - need to consider whether to support ref-only databases for assignment
421427
lineage_distances,
422428
False, # serial - needs to be supported for web version?
429+
None, # stable - not supported here
423430
args.threads,
424431
True, # overwrite - probably OK?
425432
False, # plot_fit - turn off for now
@@ -434,10 +441,10 @@ def query_db(args):
434441
args.gpu_graph,
435442
save_partial_query_graph = False)
436443
overall_lineage[strain] = createOverallLineage(rank_list, lineageClustering)
437-
444+
438445
# Print combined strain and lineage clustering
439446
print_overall_clustering(overall_lineage,args.output + '.csv',qNames)
440-
447+
441448

442449
def print_overall_clustering(overall_lineage,output,include_list):
443450

@@ -455,7 +462,7 @@ def print_overall_clustering(overall_lineage,output,include_list):
455462
isolate_info[isolate].append(str(overall_lineage[strain][rank][isolate]))
456463
else:
457464
isolate_info[isolate] = [str(strain),str(overall_lineage[strain][rank][isolate])]
458-
465+
459466
# Print output
460467
with open(output,'w') as out:
461468
out.write('id,Cluster,')

PopPUNK/network.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1425,7 +1425,7 @@ def printClusters(G, rlist, outPrefix=None, oldClusterFile=None,
14251425
if use_gpu:
14261426
component_assignments = cugraph.components.connectivity.connected_components(G)
14271427
component_frequencies = component_assignments['labels'].value_counts(sort = True, ascending = False)
1428-
newClusters = [set() for rank in range(component_frequencies.size)]
1428+
newClusters = [set() for _ in range(component_frequencies.size)]
14291429
for isolate_index, isolate_name in enumerate(rlist): # assume sorted at the moment
14301430
component = component_assignments['labels'].iloc[isolate_index].item()
14311431
component_rank_bool = component_frequencies.index == component
@@ -1448,7 +1448,7 @@ def printClusters(G, rlist, outPrefix=None, oldClusterFile=None,
14481448
oldClusters = oldAllClusters[list(oldAllClusters.keys())[0]]
14491449
# parse all previously used clusters, including those that are merged
14501450
parsed_oldClusters = set([int(item) for sublist in (x.split('_') for x in oldClusters) for item in sublist])
1451-
1451+
14521452
new_id = max(parsed_oldClusters) + 1 # 1-indexed
14531453
while new_id in parsed_oldClusters:
14541454
new_id += 1 # in case clusters have been merged

PopPUNK/utils.py

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -146,12 +146,15 @@ def storePickle(rlist, qlist, self, X, pklName):
146146
Whether an all-vs-all self DB (for :func:`~iterDistRows`)
147147
X (numpy.array)
148148
n x 2 array of core and accessory distances
149+
150+
If None, do not save
149151
pklName (str)
150152
Prefix for output files
151153
"""
152154
with open(pklName + ".pkl", 'wb') as pickle_file:
153155
pickle.dump([rlist, qlist, self], pickle_file)
154-
np.save(pklName + ".npy", X)
156+
if isinstance(X, np.ndarray):
157+
np.save(pklName + ".npy", X)
155158

156159

157160
def readPickle(pklName, enforce_self=False, distances=True):
@@ -266,7 +269,7 @@ def readIsolateTypeFromCsv(clustCSV, mode = 'clusters', return_dict = False):
266269
File name of CSV with isolate assignments
267270
mode (str)
268271
Type of file to read 'clusters', 'lineages', or 'external'
269-
return_type (str)
272+
return_dict (bool)
270273
If True, return a dict with sample->cluster instead
271274
of sets
272275
[default = False]

README.md

Lines changed: 14 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -21,15 +21,24 @@ Lees JA, Harris SR, Tonkin-Hill G, Gladstone RA, Lo SW, Weiser JN, Corander J, B
2121
Fast and flexible bacterial genomic epidemiology with PopPUNK. *Genome Research* **29**:304-316 (2019).
2222
doi:[10.1101/gr.241455.118](https://doi.org/10.1101/gr.241455.118)
2323

24-
You can also run your command with `--citation` to get a [list of citations](https://poppunk.readthedocs.io/en/latest/citing.html) and a
25-
suggested methods paragraph.
24+
You can also run your command with `--citation` to get a [list of citations](https://poppunk.readthedocs.io/en/latest/citing.html) and a suggested methods paragraph.
2625

2726
## News and roadmap
2827

2928
The [roadmap](https://poppunk.bacpop.org/roadmap.html) can be found in the documentation.
3029

31-
### 2023-01-18
30+
### 2024-08-07
31+
PopPUNK 2.7.0 comes with two changes:
32+
- Distance matrices `<db_name>.dists.npy` are no longer required or written when using
33+
`poppunk_assign`, with or without `--update-db`. These can be very large, especially
34+
with many samples, so this saves space and memory in model reuse and distribution. Note that
35+
the `<db_name>.dists.pkl` file is still required (but this is small).
36+
- We have added a `--stable` flag to `poppunk_assign`. Rather than merging hybrid clusters,
37+
new samples will simply be assigned to their nearest neighbour. This implies `--serial` and
38+
cannot be run with `--update-db`. This behaviour mimics the 'stable nomenclature' of schemes
39+
such as [LIN](https://doi.org/10.1093/molbev/msac135).
3240

41+
### 2023-01-18
3342
We have retired the PopPUNK website. Databases have been expanded, and can be
3443
found here: https://www.bacpop.org/poppunk/.
3544

@@ -45,11 +54,13 @@ change clusters).
4554
If this is a common problem let us know, as we could write a script to 'upgrade'
4655
HDBSCAN models.
4756
See issue [#213](https://github.com/bacpop/PopPUNK/issues/213) for more details.
57+
4858
### 2021-03-15
4959
We have fixed a number of bugs that may affect the use of `poppunk_assign` with
5060
`--update-db`. We have also fixed a number of bugs with GPU distances. These are
5161
'advanced' features and are not likely to be encountered in most cases, but if you do wish to use either of these features please make sure that you are using
5262
`PopPUNK >=v2.4.0` with `pp-sketchlib >=v1.7.0`.
63+
5364
### 2020-09-30
5465
We have discovered a bug affecting the interaction of pp-sketchlib and PopPUNK.
5566
If you have used `PopPUNK >=v2.0.0` with `pp-sketchlib <v1.5.1` label order may

docs/index.rst

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -58,7 +58,7 @@ The advantages of PopPUNK are broadly that:
5858
- It is fast, and scalable to over :math:`10^{5}` genomes in a single run.
5959
- Assigning new query sequences to a cluster using an existing database is scalable even beyond this.
6060
- Cluster names remain consistent between studies, and other cluster labels such as MLST
61-
can be appended.
61+
can be appended. **Please note that when used as documented PopPUNK outputs stable nomenclature**.
6262
- Databases can be updated online (as sequences arrive).
6363
- Online updating is equivalent to building databases from scratch.
6464
- Databases can be kept small and manageable by only keeping representative isolates.

docs/model_distribution.rst

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@ Database contents
1212
A database requires the following files:
1313

1414
- ``.h5``. The sketch database, a HDF5 file.
15-
- ``.dists.pkl`` and ``.dists.npy`` files. Distances for all vs all samples in the sketch database.
15+
- ``.dists.pkl`` file. Order and names of samples in the sketch database.
1616
- ``_fit.npz`` and ``_fit.pkl`` files. Python files which describe the model fit.
1717
- ``_graph.gt``. The network relating distances, fit and strain assignment for all samples in the sketch database.
1818
- ``_clusters.csv``. The strain assignment of all samples in the sketch database.

docs/overview.rst

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,9 @@ See :doc:`query_assignment` for instructions on how to use this mode.
3232
You can think of this as being similar to using an existing MLST/cgMLST/wgMLST scheme
3333
to define your sample's strains.
3434

35+
If you want to avoid any merged clusters (and get 'stable nomenclature') use the
36+
``--stable`` flag.
37+
3538
Fit your own model
3639
^^^^^^^^^^^^^^^^^^
3740
If a database isn't available for your species, you can fit your own. This consists of three steps:

docs/query_assignment.rst

Lines changed: 24 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,29 @@ Briefly, `download your reference database <https://www.bacpop.org/poppunk/>`__
88
poppunk_assign --db database --query qfile.txt \
99
--output poppunk_clusters --threads 8
1010

11+
Nomenclature
12+
------------
13+
14+
PopPUNK clusters are numbered from one upwards, in decreasing order of size in the initial
15+
dataset.
16+
17+
``poppunk_assign`` will assign your genomes into these existing clusters, with the same labels as the
18+
initial run. So cluster labels, when used as documented, **do not change**.
19+
20+
In some cases, due to undersampling of the initial dataset or emergence
21+
of hybrids, some clusters may be merged. These merged clusters will be named with
22+
underscores separating the older clusters they were merged from. Use ``--external-clustering``
23+
if you prefer other nicknames for these.
24+
25+
If you require 'stable nomenclature' where clusters never merge, use the ``--stable`` option
26+
with ``poppunk_assign``. Each query will be assigned based on its nearest neighbour's cluster,
27+
though novel clusters will still be separately identified as 'NA'.
28+
29+
Note that maintaining stable nomenclature in a dynamic population is not possible (for any
30+
nomenclature). If you are maintaining a database and want to add new queries in, you will
31+
need to use ``--update-db`` which may merge clusters. There is no way with two or more updates
32+
of giving consistent new names to merged clusters.
33+
1134
Downloading a database
1235
----------------------
1336
Current PopPUNK databases can be found here: https://www.bacpop.org/poppunk/
@@ -18,7 +41,7 @@ as queries. The clusters assigned by PopPUNK are variable-length-k-mer clusters
1841
A database called ``database`` will contain the following files, in ``database/``:
1942

2043
- ``database.h5`` -- the sketches of the reference sequences generated by ``pp-sketchlib``.
21-
- ``database.dists.npy`` and ``database.dists.pkl`` -- the core and accessory distances for
44+
- ``database.dists.pkl`` -- the order of the core and accessory distances for
2245
all pairwise comparisons in the sketch database.
2346
- ``database_fit.npz`` and ``database_fit.pkl`` -- the model fit to the core and accessory distances.
2447
- ``database_graph.gt`` -- the network defining the fit (loadable with ``graph_tool``).

0 commit comments

Comments
 (0)