-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathclade_assigner.py
More file actions
executable file
·1180 lines (983 loc) · 52.2 KB
/
clade_assigner.py
File metadata and controls
executable file
·1180 lines (983 loc) · 52.2 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
#!/usr/bin/env python3
"""
Purpose: Given a set of clade representative IDs, determine the mutually exclusive members of each clade in the tree
Author : Chase W. Nelson <chase.nelson@nih.gov>
Cite : https://github.com/chasewnelson/
Date : 2022-11-10
"""
import argparse
import numpy as np
import os
# import random
import re
import sys
import time
# from Bio import AlignIO
# from Bio.Align import AlignInfo # for .dumb_consensus()
# from Bio.Align import MultipleSeqAlignment
from collections import Counter, defaultdict
from ete3 import Tree
# from ete3.parser import newick
# from evobioinfo import GAPS, hamming, NUCS_DEFINED, NUCS_INDETERMINATE
from pprint import pprint
from typing import Dict, List, NamedTuple, Optional, TextIO
usage = """# -----------------------------------------------------------------------------
clade_assigner.py - Given representatives, determine the mutually exclusive members of each clade in the tree
# -----------------------------------------------------------------------------
For DOCUMENTATION, run:
$ clade_assigner.py --help
$ pydoc ./clade_assigner.py
# -----------------------------------------------------------------------------
# -----------------------------------------------------------------------------
EXAMPLE:
$ clade_assigner.py --help
# -----------------------------------------------------------------------------
"""
class Args(NamedTuple):
""" Command-line arguments """
tree: TextIO
reps: TextIO
prefix: str
permutations: int
seed: Optional[int] # TODO: add --root for specific OR random?
recursion: int
threshold: float
maxNA: float
begin_num: int
normalize: bool
suppress: bool
verbose: bool
ebullient: bool
# -----------------------------------------------------------------------------
def get_args() -> Args:
""" Get command-line arguments """
parser = argparse.ArgumentParser(
description='Determine the mutually exclusive members of each clade in the tree. HELP: clade_assigner.py --help',
formatter_class=argparse.ArgumentDefaultsHelpFormatter)
# Rename "optional" arguments
parser._optionals.title = 'Named arguments'
# -------------------------------------------------------------------------
# REQUIRED
parser.add_argument('-t',
'--tree',
metavar='FILE',
help='NEWICK file containing the tree(s) [REQUIRED]',
required=True,
nargs=1,
type=argparse.FileType('rt'))
parser.add_argument('-r',
'--reps',
metavar='FILE|str',
help='Sequence ID(s) (comma-separated) of clade representatives [REQUIRED]',
required=True,
nargs=1,
type=str)
# -------------------------------------------------------------------------
# OPTIONAL
parser.add_argument('-o',
'--prefix',
metavar='str',
help='Output file name PREFIX [OPTIONAL]',
required=False,
type=str,
default='<NEWICK>_clades') # detect this later
parser.add_argument('-p',
'--permutations',
metavar='int',
help='Number of permutations (random root and pruning orders) per tree [OPTIONAL]',
required=False,
type=int,
default=100)
parser.add_argument('-s',
'--seed',
help='RANDOM NUMBER SEED to be used [OPTIONAL]',
required=False,
metavar='int',
type=int,
default=None)
parser.add_argument('-R',
'--recursion',
help='System RECURSION LIMIT (may be exceeded for large trees); use with caution [OPTIONAL]',
required=False,
metavar='int',
type=int,
default=1000)
parser.add_argument('-T',
'--threshold',
help='Minimum proportion of supporting permutations required to classify a sequence [OPTIONAL]',
required=False,
metavar='float',
type=float,
default=0.95)
parser.add_argument('-m',
'--maxNA',
help='Maximum proportion of NA permutations allowed for a sequence to be assigned [OPTIONAL]',
required=False,
metavar='float',
type=float,
default=0.5)
parser.add_argument('-b',
'--begin_num',
help='Beginning tree number, useful for generating unique tree numbers for downstream '
'analyses [OPTIONAL]',
required=False,
metavar='int',
type=int,
default=1)
parser.add_argument('-n',
'--normalize',
help='Normalize evolutionary distance to largest leaf-to-rep distance [OPTIONAL]',
action='store_true')
parser.add_argument('-S',
'--suppress',
help='Suppress most output [OPTIONAL]',
action='store_true')
parser.add_argument('-v',
'--verbose',
help='Verbose output [OPTIONAL]',
action='store_true')
parser.add_argument('-e',
'--ebullient',
help='Positively ebullient output [OPTIONAL]',
action='store_true')
args = parser.parse_args()
# -------------------------------------------------------------------------
# VALIDATE arguments
# FORM the output file PREFIX if not given
if args.prefix == '<NEWICK>_clades':
args.prefix = os.path.splitext(args.tree[0].name)[0] + '_clades_p' + str(args.permutations)
# ensure THREE out_file are not present
out_file_counts = args.prefix + '_counts.tsv'
out_file_props = args.prefix + '_props.tsv'
out_file_means = args.prefix + '_means.tsv'
if os.path.isfile(out_file_counts) or os.path.isfile(out_file_props) or os.path.isfile(out_file_means):
parser.error(f'\n### ERROR: out_file="{out_file_counts}"/"{out_file_props}"/"{out_file_means}" already exist')
# if reps is file, convert to string
# print(f'reps:')
# print(args.reps)
if type(args.reps) is list and os.path.isfile(args.reps[0]):
args.reps = open(args.reps[0]).read().rstrip()
# die if reps contains anything besides integers, words, whitespace, and commas
# re_NOT_d_w_s_comma = re.compile(r'[^\d\w\s\n,]')
re_NOT_FASTA_name = re.compile(r'[^\d\w\s\n,.|]')
if re_NOT_FASTA_name.search(args.reps):
# parser.error(f'\n### ERROR: reps="{args.reps[:20]}..." may only contain integers, whitespace, and/or commas(,)')
parser.error(f'\n### ERROR: reps="{args.reps[:20]}..." may only contain words, numbers, decimals, whitespace, '
'commas(,), or pipes (|)')
if args.seed is not None and args.seed < 1:
parser.error(f'\n### ERROR: seed="{args.seed}" must be > 0')
if not 1000 <= args.recursion <= 10000:
parser.error(f'\n### ERROR: recursion="{args.recursion}" must be >= 1000 and <= 10000')
if not 0 < args.threshold <= 1:
parser.error(f'\n### ERROR: threshold="{args.threshold}" must be > 0 and <= 1')
if not args.begin_num >= 1:
parser.error(f'\n### ERROR: begin_num="{args.begin_num}" must be >= 1')
return Args(tree=args.tree[0],
reps=args.reps,
prefix=args.prefix,
permutations=args.permutations,
seed=args.seed,
recursion=args.recursion,
threshold=args.threshold,
maxNA=args.maxNA,
begin_num=args.begin_num,
normalize=args.normalize,
suppress=args.suppress,
verbose=args.verbose,
ebullient=args.ebullient)
# -----------------------------------------------------------------------------
def main() -> None:
""" The one day the sun shines in Taipei I'm coding """
start_time = time.time()
# -------------------------------------------------------------------------
# GATHER arguments
args = get_args()
tree_fh = args.tree
reps = args.reps
prefix = args.prefix
permutations = args.permutations
seed = args.seed
recursion = args.recursion
threshold = args.threshold
maxNA = args.maxNA
begin_num = args.begin_num
normalize = args.normalize
suppress = args.suppress
verbose = args.verbose
ebullient = args.ebullient
# THREE OUTPUT FILES from prefix
out_file_counts = prefix + '_counts.tsv'
out_file_props = prefix + '_props.tsv'
out_file_means = prefix + '_means.tsv'
# -------------------------------------------------------------------------
# INITIALIZE OUTPUT
print(usage)
# -------------------------------------------------------------------------
# REGEX & TUPLES
# PASS
# -------------------------------------------------------------------------
# INITIALIZE LOG
print('# -----------------------------------------------------------------------------')
print(f'LOG:command="{" ".join(sys.argv)}"')
print(f'LOG:cwd="{os.getcwd()}"')
print(f'LOG:tree="{tree_fh.name}"')
print(f'LOG:reps="{reps}"')
print(f'LOG:out_file_counts="{out_file_counts}"')
print(f'LOG:out_file_counts="{out_file_props}"')
print(f'LOG:out_file_counts="{out_file_means}"')
print(f'LOG:permutations={permutations}')
print(f'LOG:seed={seed}')
print(f'LOG:recursion={recursion}')
print(f'LOG:threshold={threshold}')
print(f'LOG:maxNA={maxNA}')
print(f'LOG:begin_num={begin_num}')
print(f'LOG:normalize={normalize}')
print(f'LOG:suppress={suppress}')
print(f'LOG:verbose={verbose}')
print(f'LOG:ebullient={ebullient}')
# RECURSION LIMIT
sys.setrecursionlimit(recursion)
# -------------------------------------------------------------------------
# INITIALIZE lists from comma-separated input
re_s = re.compile(r'\s+') # whitespace
# replace commas with whitespace
reps = reps.replace(',', ' ')
rep_seq_names = tuple(re_s.split(reps))
rep_seq_names_count = len(rep_seq_names)
rep_seq_names_uniq_count = len(set(rep_seq_names))
# LOG RESULT
print(f'RESULT:rep_seq_names_count={rep_seq_names_count}')
if verbose:
print('rep_seq_names:')
print(rep_seq_names)
if rep_seq_names_count != rep_seq_names_uniq_count:
sys.exit(f'\n### ERROR: Duplicates provided in --reps: count={rep_seq_names_count} but ' + \
f'only {rep_seq_names_uniq_count} are unique.\n')
# -------------------------------------------------------------------------
# Calculate or record random number seed for random permutations of reps
if seed is None: # emulates Slatkin Item 24
seed = np.random.randint(2 ** 32 - 1)
# Set random number seed
np.random.seed(seed) # np.random does not affect random module
print(f'LOG:seed={seed}')
print(f'RESULT:seed_rand_result={np.random.rand()}')
# -------------------------------------------------------------------------
# LOOP TREES (one per line)
# PREPARE a dict[seqID] -> list
ID_rep_count_ddl: Dict[str, Dict[str, list]] = defaultdict(dict) # ID => REP_ID => LIST OF COUNTS
ID_prop_ddl: Dict[str, Dict[str, list]] = defaultdict(dict) # ID => REP_ID => LIST OF PROPS
ID_rep_dist_ddl: Dict[str, Dict[str, list]] = defaultdict(dict) # ID => REP_ID => LIST OF PROPS
ID_count_total_dl: Dict[str, list] = defaultdict(list) #
ID_count_exp_dl: Dict[str, list] = defaultdict(list)
ID_count_assn_dl: Dict[str, list] = defaultdict(list) # ID => LIST OF COUNT SUMS
rep_seq_names_list_sorted: List[str] = []
rep_seq_names_list_sorted_wNA : List[str] = []
# initialize tree_num
tree_num = begin_num - 1 # tree_num = 0
for tree in tree_fh:
tree = tree.rstrip()
tree_num += 1
# # Debugging break
# if tree_num > 2:
# break
print('\n# -------------------------------------------------------------------------')
print(f'Tree {tree_num}')
if not suppress:
print('permutations', flush=True)
# ---------------------------------------------------------------------
# INPUT TREE using ete3
tree_node = Tree(tree)
# # Determine topology height of each node
# rep_height_topology = defaultdict(int)
# for this_rep_name in rep_seq_names:
# node = tree_node.search_nodes(name=this_rep_name)[0]
# rep_height_topology[this_rep_name] = tree_node.get_distance(node, topology_only=True)
#
# print('\nrep_height_topology:')
# pprint(rep_height_topology)
# ---------------------------------------------------------------------
# STORE DISTANCES between each sequence and each rep, ignoring self comparisons
all_leaf_names = tree_node.get_leaf_names()
leaf_to_rep_dists: Dict[str, Dict[str, float]] = defaultdict(dict)
leaf_to_closest_rep_name: Dict[str, str] = defaultdict(str)
leaf_to_closest_rep_dist: Dict[str, float] = defaultdict(float)
largest_dist = 0
for leaf_name in all_leaf_names:
# initialize
leaf_to_rep_dists[leaf_name]: Dict[str, float] = defaultdict(float)
closest_rep_name = ''
closest_rep_dist = 100000. # something arbitrary, could do better
for rep_name in rep_seq_names:
if leaf_name != rep_name:
rep_node = tree_node&rep_name
leaf_node = tree_node&leaf_name
this_dist = leaf_node.get_distance(rep_node)
# store
leaf_to_rep_dists[leaf_name][rep_name] = this_dist
# check if closest
if this_dist < closest_rep_dist:
closest_rep_name = str(rep_name)
closest_rep_dist = float(this_dist)
# check if largest
if this_dist > largest_dist:
largest_dist = this_dist
else:
closest_rep_name = 'REP'
closest_rep_dist = 0.
# store closest rep for this leaf
leaf_to_closest_rep_name[leaf_name] = closest_rep_name
leaf_to_closest_rep_dist[leaf_name] = closest_rep_dist
# NORMALIZE to LARGEST DISTANCE
if normalize:
for leaf_name in all_leaf_names:
for rep_name in rep_seq_names:
leaf_to_rep_dists[leaf_name][rep_name] = leaf_to_rep_dists[leaf_name][rep_name] / largest_dist
leaf_to_closest_rep_dist[leaf_name] = leaf_to_closest_rep_dist[leaf_name] / largest_dist
# print('leaf_to_rep_dists:')
# pprint(dict(leaf_to_rep_dists))
# print('leaf_to_closest_rep_name:')
# pprint(dict(leaf_to_closest_rep_name))
# print('leaf_to_closest_rep_dist:')
# pprint(dict(leaf_to_closest_rep_dist))
# NOTES ---
# these dictionaries have VALUES that correspond to this ONE TREE
# this is the SAME as the counts/props for the permutations, which also corresponds to THIS ONE TREE
# JUST LIKE THE MEANS of the trees -- same deal
# ---------------------------------------------------------------------
# STORE number of steps required to maximize clade members without including another representative
rep_steps_dl = defaultdict(list)
# rep_height_dl = defaultdict(list)
# rep_height_topology_dl = defaultdict(list)
# STORE assignments of each ID
seq_rep_count_dd = defaultdict(dict)
# initialize each dict of each ID
for leaf_name in all_leaf_names:
seq_rep_count_dd[leaf_name] = defaultdict(int)
# print(f'LENGTH of all_leaf_names={len(all_leaf_names)}') # LENGTH of all_leaf_names=6052 Q.E.D.
# initiate list of rep seq names that can be shuffled
rep_seq_names_list = list(rep_seq_names)
rep_seq_names_list_sorted = sorted(rep_seq_names_list)
# rep_seq_names_list_sorted = sorted(list(rep_seq_names))
# version with NA
rep_seq_names_list_sorted_wNA = sorted(list(rep_seq_names_list_sorted))
rep_seq_names_list_sorted_wNA.append('NA')
# -------------------------------------------------------------------------
# INITIALIZE the dl and ddl if FIRST TREE
if tree_num == begin_num: # if tree_num == 1:
# ID_rep_count_ddl: ID => REP_ID => LIST OF COUNTS
# ID_rep_count_ddl: Dict[str, Dict[str, list]] = defaultdict(dict)
for leaf_name in all_leaf_names:
ID_count_total_dl[leaf_name] = []
ID_count_exp_dl[leaf_name] = []
ID_count_assn_dl[leaf_name] = []
for rep_name in rep_seq_names_list_sorted_wNA: # rep_seq_names_list_sorted:
ID_rep_count_ddl[leaf_name][rep_name] = []
ID_prop_ddl[leaf_name][rep_name] = []
ID_rep_dist_ddl[leaf_name][rep_name] = []
# # NA
# ID_rep_count_ddl[leaf_name]['NA'] = []
# ID_prop_ddl[leaf_name]['NA'] = []
# -------------------------------------------------------------------------
# LOOP permutations
for i in range(permutations):
# Make a COPY of the tree for detaching
tree_node_copy = tree_node.copy()
# ROOT the TREE COPY using a RANDOMLY SELECTED representative node
random_rep_name = np.random.choice(rep_seq_names_list) # str() didn't work
# print(f'\nrandom_rep_name="{random_rep_name}" type {type(random_rep_name)}')
# PRINT PROGRESS
if not suppress:
print('{:<7}'.format(i+1) + f'root {random_rep_name} - prune ', end='')
random_rep_node = tree_node_copy.search_nodes(name=random_rep_name)[0]
tree_node_copy.set_outgroup(random_rep_node)
# DEBUG
# print(f'rep_seq_names_list="{rep_seq_names_list}"')
# Make shuffled list of reps with random root subtracted
# rep_seq_names_shuffled = list(set(rep_seq_names_list).difference(set(random_rep_name)))
rep_seq_names_shuffled = list(rep_seq_names_list)
# print(f'rep_seq_names_shuffled1="{rep_seq_names_shuffled}" type="{type(rep_seq_names_shuffled)}"')
# rep_seq_names_shuffled = [str(item) for item in rep_seq_names_shuffled]
# print(f'rep_seq_names_shuffled2="{rep_seq_names_shuffled}"')
rep_seq_names_shuffled.remove(random_rep_name)
# print(f'rep_seq_names_shuffled3="{rep_seq_names_shuffled}"')
np.random.shuffle(rep_seq_names_shuffled)
# print(f'rep_seq_names_shuffled4="{rep_seq_names_shuffled}"')
# Loop all reps and store their nodes
# if ebullient:
# rooted_tree_name = str(out_file) # TODO
# rooted_tree_name.rstrip(".tsv")
# rooted_tree_name = rooted_tree_name + '_root' + random_rep_name + ".tree"
# tree_node_copy.write(format=1, outfile=rooted_tree_name)
leaf_names_assigned = []
random_rep_order = []
# for count, this_rep_name in enumerate(rep_seq_names_list, start=1): # SHUFFLED; count is 1, 2, ..., length
for count, this_rep_name in enumerate(rep_seq_names_shuffled, start=1): # doesn't include the root
if count < len(rep_seq_names_shuffled):
# if this_rep_name != random_rep_name and count < len(rep_seq_names_list) - 1:
# if this_rep_name == random_rep_name:
# count -= 1 # don't count the matching one
# elif count < len(rep_seq_names_list) - 1:
node = tree_node_copy.search_nodes(name=this_rep_name)[0]
random_rep_order.append(this_rep_name)
# # get distance from root node
# rep_height_dl[this_rep_name].append(tree_node_copy.get_distance(node))
#
# # get topology distance from root node
# rep_height_topology_dl[this_rep_name].append(tree_node_copy.get_distance(node, topology_only=True))
# COUNTER for number of nodes visited in search for this rep's clade
steps = 0
while node:
leaf_names = node.get_leaf_names()
if verbose:
print(f'steps={steps} (n={len(leaf_names)} leaves)')
if ebullient:
print(f'steps={steps} Leaves: {leaf_names} (n={len(leaf_names)})')
# Check whether leaf names contain any other representatives
leaves_in_reps = [leaf for leaf in leaf_names if leaf in rep_seq_names]
# if verbose:
# print(f'leaves_in_reps: {leaves_in_reps} (n={len(leaves_in_reps)})')
# print("\n================================================================================")
# WENT TOO FAR; die when matching other ref
if len(leaves_in_reps) > 1:
if verbose:
print(f'At steps={steps}, included {len(leaves_in_reps)} representatives; store steps={steps - 1}')
print(f'Found clade containing the {this_rep_name} representative, {steps} nodes from leaf')
# rep_steps[this_rep_node_name] = steps
rep_steps_dl[this_rep_name].append(steps - 1) # because we went too far
if verbose:
print("\n================================================================================")
print("================================================================================")
break
else:
# Move up the tree
node = node.up
steps += 1
# FIND CLADE NODE & DETACH
node = tree_node_copy.search_nodes(name=this_rep_name)[0]
step_counter = 0
for i in range(steps - 1): # we stored the the right number of steps; for 2 steps, want len([0,1]] steps
step_counter += 1
node = node.up
if verbose:
print(f'Traversed step_counter={step_counter} steps')
# GET LEAVES and SAVE DATA
leaf_names = node.get_leaf_names()
if verbose:
print(f'leaf_names (n={len(leaf_names)}):')
print(f'n={len(leaf_names)} leaves assigned to clade={this_rep_name}')
# leaf_names_counter_dict = dict(Counter(leaf_names))
if ebullient:
print(leaf_names)
for leaf_name in leaf_names:
if verbose:
print(f'name={leaf_name}, this_rep_name={this_rep_name}')
seq_rep_count_dd[leaf_name][this_rep_name] += 1
leaf_names_assigned.extend(leaf_names)
# DETACH
node.detach()
# TODO: REMAINING leaves are classified as NA
leaf_names_NA = sorted(list(set(all_leaf_names).difference(set(leaf_names_assigned))))
# print(f'len(leaf_names_NA)={len(leaf_names_NA)}')
for leaf_name in leaf_names_NA:
seq_rep_count_dd[leaf_name]['NA'] += 1
# Print order of pruning
if not suppress:
print(','.join(random_rep_order)) # + f' (n={len(random_rep_order)})')
# print("\n================================================================================")
# print('RESULTS')
#
# print('\nrep_steps:')
# pprint(rep_steps)
#
# print('\nrep_height:')
# pprint(rep_height)
#
# print('\nrep_height_topology:')
# pprint(rep_height_topology)
#
# # Sort by topological height
# rep_height_topology_sorted_ids = [id for dist, id in sorted(zip(rep_height_topology.values(), rep_height_topology.keys()), reverse=True)]
# print('\nrep_height_topology_sorted_ids:')
# pprint(rep_height_topology_sorted_ids)
# print('\nmax_height_node:')
print("\n================================================================================")
print('RESULTS')
if verbose:
print('\nrep_steps_dl:')
print(rep_steps_dl)
# COUNT times each number of steps observed
if not suppress:
print('\nrep_steps_dl - max steps toward root before another clade representative included (STEPS: PERMUTS):')
for name in rep_seq_names:
step_counts = Counter(rep_steps_dl[name])
print(f'{name}:')
pprint(step_counts)
if verbose:
# print('\nrep_height_dl:')
# print(rep_height_dl)
print('\nseq_rep_count_dd:')
pprint(seq_rep_count_dd)
# print('\nrep_height_topology_dl:')
# print(rep_height_topology_dl)
# -------------------------------------------------------------------------
# Calculate the PROPORTIONAL assignment of each sequence to each clade
# verify correct number of sequences
print(f'RESULT:num_seqs_logged={len(seq_rep_count_dd)}')
# PRINT proportions to output file
# initialize output file
# out_file_hdl = open(out_file, "wt")
out_file_tree_count = f'{prefix}_tree{tree_num}_counts.tsv'
out_file_tree_count_hdl = open(out_file_tree_count, "wt")
out_file_tree_prop = f'{prefix}_tree{tree_num}_props.tsv'
out_file_tree_prop_hdl = open(out_file_tree_prop, "wt")
out_file_tree_dist = f'{prefix}_tree{tree_num}_dists.tsv'
out_file_tree_dist_hdl = open(out_file_tree_dist, "wt")
# FORM header for counts and props
header_list = ['seq_name', 'permut_total', 'permut_assn_exp', 'permut_assn_obs', 'permut_assn_NA']
header_list.extend(rep_seq_names_list_sorted_wNA)
# FORM heade for dists
header_dists_list = ['seq_name', 'permut_total', 'permut_assn_exp', 'permut_assn_obs', 'permut_assn_NA',
'closest_rep', 'closest_rep_dist']
# additional columns for DISTANCES to each rep
rep_seq_names_list_sorted_dist = []
for this_rep in rep_seq_names_list_sorted:
rep_seq_names_list_sorted_dist.append(this_rep + '_dist')
# header_dists_list.append(this_rep + '_dist')
header_dists_list.extend(rep_seq_names_list_sorted_dist)
# list all the reps twice?
# leaf_to_rep_dists
# leaf_to_closest_rep_name
# leaf_to_closest_rep_dist
# header_list.append('NA')
header = '\t'.join(header_list)
header_dists = '\t'.join(header_dists_list)
# PRINT header
# print(header)
out_file_tree_count_hdl.write(header + '\n')
out_file_tree_prop_hdl.write(header + '\n')
out_file_tree_dist_hdl.write(header_dists + '\n')
# TODO: I guess one solution is simply to make LISTS at THIS step instead of single values
# Then a subsequent step would compute the means (or whatever) of those lists
# This would also allow a printing of two versions: one of the, say, comma-separated lists, the other their means
# ID_count_total_dl[leaf_name] = []
# ID_count_exp_dl[leaf_name] = []
# ID_count_assn_dl[leaf_name] = []
# LOOP ALL seqs, print proportional assignment to each representative (column)
for name in sorted(seq_rep_count_dd.keys()):
# if name in rep_seq_names: # was thinking to treat the representatives themselves specially, but naw
# count_list: List = []
count_dict: Dict[str, int] = defaultdict(int)
# LOOP REP seqs (output columns), append number of times seq assigned
for rep_name in rep_seq_names_list_sorted_wNA: # rep_seq_names_list_sorted:
# line_list.append(seq_rep_count_dd[name][rep_name])
# count_list.append(seq_rep_count_dd[name][rep_name]) # NUMBER of times this SEQ assigned to this REP
count_dict[rep_name] = seq_rep_count_dd[name][rep_name]
ID_rep_count_ddl[name][rep_name].append(seq_rep_count_dd[name][rep_name])
# # SPECIALLY ADD NA
# count_list.append(seq_rep_count_dd[name]['NA']) # NUMBER of times this SEQ assigned to this REP
# ID_rep_count_ddl[name]['NA'].append(seq_rep_count_dd[name]['NA'])
exp_permut = permutations * (len(rep_seq_names_list) - 2) / len(rep_seq_names_list) # NOT with NA
# line_list = [name, exp_permut]
# convert counts to proportions and append to line_list
# count_sum = sum(count_list)
count_total_sum = sum(count_dict.values())
count_assn_sum = sum(count_dict.values()) - count_dict['NA']
# INITIATE line lists
tree_count_line_list = [name, count_total_sum, exp_permut, count_assn_sum]
tree_prop_line_list = [name, count_total_sum, exp_permut, count_assn_sum]
tree_dist_line_list = [name, count_total_sum, exp_permut, count_assn_sum]
ID_count_total_dl[name].append(count_total_sum)
ID_count_exp_dl[name].append(exp_permut)
ID_count_assn_dl[name].append(count_assn_sum)
# add NA first for ALL
if count_total_sum > 0: # always true
this_prop = count_dict['NA'] / count_total_sum
tree_count_line_list.append(count_dict['NA'])
ID_prop_ddl[name]['NA'].append(this_prop)
tree_prop_line_list.append(this_prop)
tree_dist_line_list.append(this_prop)
else:
tree_count_line_list.append('NA') # this is for a numerical VALUE, not assigned clade name
ID_prop_ddl[name]['NA'].append('NA')
tree_prop_line_list.append('NA')
tree_dist_line_list.append('NA')
# TODO: add the closest rep and its distance
tree_dist_line_list.append(leaf_to_closest_rep_name[name])
tree_dist_line_list.append(leaf_to_closest_rep_dist[name])
# add all reps (not NA)
for rep_name in rep_seq_names_list_sorted: # no NA
count = count_dict[rep_name]
if count_assn_sum > 0:
this_prop = count / count_assn_sum
tree_count_line_list.append(count)
tree_prop_line_list.append(this_prop) # PROPORTION of times this SEQ assigned to this REP
ID_prop_ddl[name][rep_name].append(this_prop)
else:
tree_count_line_list.append('NA') # this is for a numerical VALUE, not assigned clade name
ID_prop_ddl[name][rep_name].append('NA')
tree_prop_line_list.append('NA')
# TODO REGARDLESS, the distance
# leaf_to_rep_dists
# leaf_to_closest_rep_name
# leaf_to_closest_rep_dist
# append dist to line
tree_dist_line_list.append(leaf_to_rep_dists[name][rep_name])
# append dist to master list
ID_rep_dist_ddl[name][rep_name].append(leaf_to_rep_dists[name][rep_name])
# Add NA LAST for COUNTS ONLY
if count_total_sum > 0:
this_prop = count_dict['NA'] / count_total_sum
tree_count_line_list.append(count_dict['NA'])
tree_prop_line_list.append(this_prop) # PROPORTION of times this SEQ assigned to this REP
# ID_prop_ddl[name]['NA'].append(this_prop)
else:
tree_count_line_list.append('NA') # this is for a numerical VALUE, not assigned clade name
tree_prop_line_list.append('NA') # this is for a numerical VALUE, not assigned clade name
# ID_prop_ddl[name]['NA'].append('NA')
# for i, count in enumerate(count_list):
# if count_sum > 0:
# this_prop = count / count_sum
# line_list.append(this_prop) # PROPORTION of times this SEQ assigned to this REP
# ID_prop_ddl[name][rep_seq_names_list_sorted_wNA[i]].append(this_prop)
# else:
# line_list.append('NA') # this is for a numerical VALUE, not assigned clade name
# ID_prop_ddl[name][rep_seq_names_list_sorted_wNA[i]].append('NA')
# ASSEMBLE LINE
tree_count_line = '\t'.join(map(str, tree_count_line_list))
tree_prop_line = '\t'.join(map(str, tree_prop_line_list))
tree_dist_line = '\t'.join(map(str, tree_dist_line_list))
# print(line)
# WRITE
out_file_tree_count_hdl.write(tree_count_line + '\n')
out_file_tree_prop_hdl.write(tree_prop_line + '\n')
out_file_tree_dist_hdl.write(tree_dist_line + '\n')
# CLOSE
out_file_tree_count_hdl.close()
out_file_tree_prop_hdl.close()
out_file_tree_dist_hdl.close()
# ---------------------------------------------------------------------
# TODO check for convergence WITHIN tree
# FINISH LAST TREE
# -------------------------------------------------------------------------
# MEAN RESULTS OVER ALL TRESS
# PRINT results to THREE FILES
# out_file_counts = prefix + '_counts.tsv'
# out_file_props = prefix + '_props.tsv'
# out_file_means = prefix + '_means.tsv'
out_file_counts_hdl = open(out_file_counts, "wt")
out_file_props_hdl = open(out_file_props, "wt")
out_file_means_hdl = open(out_file_means, "wt")
# header
header_start_list = ['seq_name', 'permut_total', 'permut_assn_exp', 'permut_assn_obs', 'permut_assn_NA']
# header_start_list.extend(rep_seq_names_list_sorted_wNA)
header_start = '\t'.join(header_start_list)
# print(header)
out_file_counts_hdl.write(header_start + '\t' + '\t'.join(rep_seq_names_list_sorted) + '\n')
out_file_props_hdl.write(header_start + '\t' + '\t'.join(rep_seq_names_list_sorted) + '\n')
out_file_means_hdl.write(header_start + '\t' +
'closest_rep\tclosest_rep_dist\t' +
f'top_rep\ttop_rep_prop\tassignment_{threshold}\t' +
'\t'.join(rep_seq_names_list_sorted) + '\t' +
'\t'.join(rep_seq_names_list_sorted_dist) + '\n')
##
# line_list = [name, count_total_sum, exp_permut, count_assn_sum, count_dict['NA']]
#
# ID_count_total_dl[name].append(count_total_sum)
# ID_count_exp_dl[name].append(exp_permut)
# ID_count_assn_dl[name].append(count_assn_sum)
#
# # add NA first
# if count_total_sum > 0: # always true
# this_prop = count_dict['NA'] / count_total_sum
# line_list.append(count_dict['NA'])
# ID_prop_ddl[name]['NA'].append(this_prop)
# else:
# line_list.append('NA') # this is for a numerical VALUE, not assigned clade name
# ID_prop_ddl[name]['NA'].append('NA')
#
# # add all reps (not NA)
# for rep_name, count in count_dict.items():
# if rep_name != 'NA':
# if count_assn_sum > 0:
# this_prop = count / count_assn_sum
# line_list.append(count) # PROPORTION of times this SEQ assigned to this REP
# # line_list.append(this_prop) # PROPORTION of times this SEQ assigned to this REP
# ID_prop_ddl[name][rep_name].append(this_prop)
# else:
# line_list.append('NA') # this is for a numerical VALUE, not assigned clade name
# ID_prop_ddl[name][rep_name].append('NA')
#
# for name in sorted(seq_rep_count_dd.keys()):
# # if name in rep_seq_names: # was thinking to treat the representatives themselves specially, but naw
# # count_list: List = []
# count_dict: Dict[str, int] = defaultdict(int)
#
# # LOOP REP seqs (output columns), append number of times seq assigned
# for rep_name in rep_seq_names_list_sorted_wNA: # rep_seq_names_list_sorted:
# # line_list.append(seq_rep_count_dd[name][rep_name])
# # count_list.append(seq_rep_count_dd[name][rep_name]) # NUMBER of times this SEQ assigned to this REP
# count_dict[rep_name] = seq_rep_count_dd[name][rep_name] # TODO ended here
# ID_rep_count_ddl[name][rep_name].append(seq_rep_count_dd[name][rep_name])
#
# # # SPECIALLY ADD NA
# # count_list.append(seq_rep_count_dd[name]['NA']) # NUMBER of times this SEQ assigned to this REP
# # ID_rep_count_ddl[name]['NA'].append(seq_rep_count_dd[name]['NA'])
#
# exp_permut = permutations * (len(rep_seq_names_list) - 2) / len(rep_seq_names_list) # NOT with NA
# line_list = [name, exp_permut]
# ID_exp_permut_dl[name].append(exp_permut)
#
# # convert counts to proportions and append to line_list
# # count_sum = sum(count_list)
# count_sum = sum(count_dict.values()) - count_dict['NA']
# line_list.append(count_sum)
# ID_count_sum_dl[name].append(count_sum)
#
# for rep_name, count in count_dict.items():
# if count_sum > 0:
# this_prop = count / count_sum
# line_list.append(count) # PROPORTION of times this SEQ assigned to this REP
# # line_list.append(this_prop) # PROPORTION of times this SEQ assigned to this REP
# ID_prop_ddl[name][rep_name].append(this_prop)
# else:
# line_list.append('NA') # this is for a numerical VALUE, not assigned clade name
# ID_prop_ddl[name][rep_name].append('NA')
#
# # for i, count in enumerate(count_list):
# # if count_sum > 0:
# # this_prop = count / count_sum
# # line_list.append(this_prop) # PROPORTION of times this SEQ assigned to this REP
# # ID_prop_ddl[name][rep_seq_names_list_sorted_wNA[i]].append(this_prop)
# # else:
# # line_list.append('NA') # this is for a numerical VALUE, not assigned clade name
# # ID_prop_ddl[name][rep_seq_names_list_sorted_wNA[i]].append('NA')
#
# line = '\t'.join(map(str, line_list))
# # print(line)
# out_file_hdl.write(line + '\n')
#
# # for name in sorted(seq_rep_count_dd.keys()):
# # # if name in rep_seq_names: # was thinking to treat the representatives themselves specially, but naw
# # # count_list: List = []
# # count_dict: Dict[str, int] = defaultdict(int)
# #
# # # LOOP REP seqs (output columns), append number of times seq assigned
# # for rep_name in rep_seq_names_list_sorted_wNA: # rep_seq_names_list_sorted:
# # # line_list.append(seq_rep_count_dd[name][rep_name])
# # # count_list.append(seq_rep_count_dd[name][rep_name]) # NUMBER of times this SEQ assigned to this REP
# # count_dict[rep_name] = seq_rep_count_dd[name][rep_name] # TODO ended here
# # ID_rep_count_ddl[name][rep_name].append(seq_rep_count_dd[name][rep_name])
# #
# # # # SPECIALLY ADD NA
# # # count_list.append(seq_rep_count_dd[name]['NA']) # NUMBER of times this SEQ assigned to this REP
# # # ID_rep_count_ddl[name]['NA'].append(seq_rep_count_dd[name]['NA'])
# #
# # exp_permut = permutations * (len(rep_seq_names_list) - 2) / len(rep_seq_names_list) # NOT with NA
# # line_list = [name, exp_permut]
# # ID_exp_permut_dl[name].append(exp_permut)
# #
#
# #
# # # for i, count in enumerate(count_list):
# # # if count_sum > 0:
# # # this_prop = count / count_sum
# # # line_list.append(this_prop) # PROPORTION of times this SEQ assigned to this REP
# # # ID_prop_ddl[name][rep_seq_names_list_sorted_wNA[i]].append(this_prop)
# # # else:
# # # line_list.append('NA') # this is for a numerical VALUE, not assigned clade name
# # # ID_prop_ddl[name][rep_seq_names_list_sorted_wNA[i]].append('NA')
# #
# # line = '\t'.join(map(str, line_list))
# # # print(line)
# # out_file_hdl.write(line + '\n')
# #
# #
# #
# OUTPUT:
# ID_rep_count_ddl: Dict[str, Dict[str, list]] = defaultdict(dict) # ID => REP_ID => LIST OF COUNTS
# ID_prop_ddl: Dict[str, Dict[str, list]] = defaultdict(dict) # ID => REP_ID => LIST OF PROPS
# ID_count_total_dl[leaf_name] = []
# ID_count_exp_dl[leaf_name] = []
# ID_count_assn_dl[leaf_name] = [] # ID => LIST OF COUNT SUMS
for name in sorted(ID_rep_count_ddl.keys()):
count_list = [] # each item is a comma-separated string
prop_list = []
mean_list = []
dist_list = []
# count_dict: Dict[str, str] = defaultdict(str)
# prop_dict: Dict[str, str] = defaultdict(str)
# mean_dict: Dict[str, float] = defaultdict(float)
# NA first
count_list.append(','.join(map(str, ID_rep_count_ddl[name]['NA']))) # NUM times SEQ assigned to REP
prop_list.append(','.join(map(str, ID_prop_ddl[name]['NA'])))
this_NA_mean = np.mean(ID_prop_ddl[name]['NA']) # this is of TOTAL, not of ASSIGNED
# mean_list.append(this_NA_mean) # put it in another place later
# Track top assn and value
top_rep_choice = 'NA'
top_rep_prop = 0.
rep_assigned = 'NA'
# Track closest rep and dist
closest_rep = 'NA'
closest_rep_d = 100000. # something arbitrary, could do better
# TODO: here, get the mean distance for each rep_name and save in a dict
# then, the mean dist to each rep can be calculated and the lowest mean determined
# INITIALIZE mean distances; this refers to ONLY THIS SEQ NAME
# rep_dist_dl: Dict[str, list] = defaultdict(list) # REP_ID => LIST OF DISTS
for rep_name in rep_seq_names_list_sorted: # no NA
# -----------------------------------------------------------------
# COUNTS and PROPS
# count_dict[rep_name] = ','.join(map(str, ID_rep_count_ddl[name][rep_name])) # NUM times SEQ assigned to REP
# prop_dict[rep_name] = ','.join(map(str, ID_prop_ddl[name][rep_name]))
# mean_dict[rep_name] = np.mean(ID_prop_ddl[name][rep_name])
count_list.append(','.join(map(str, ID_rep_count_ddl[name][rep_name]))) # NUM times SEQ assigned to REP
prop_list.append(','.join(map(str, ID_prop_ddl[name][rep_name])))
# this_rep_mean = np.mean(ID_prop_ddl[name][rep_name])
# this_rep_mean = np.mean(map(float, ID_prop_ddl[name][rep_name]))
this_rep_numer = 0
this_rep_denom = 0