33import argparse
44import numpy as np
55import xpclr
6- from xpclr import util
76import os
7+ import logging
88
9+ LOGFORMAT = '%(asctime)s : %(levelname)s : %(message)s'
910
10- # Argument parsing
11- psr = argparse .ArgumentParser (
12- description = 'Tool to calculate XP-CLR as per Chen, Patterson, Reich 2010' )
1311
14- # files:
15- psr .add_argument ('--out' , "-O" , required = True , help = 'output file' )
12+ def main ():
13+ # Argument parsing
14+ psr = argparse .ArgumentParser (
15+ description = 'Tool to calculate XP-CLR as per Chen, Patterson, Reich 2010' )
1616
17- # data inputs for hdf5 format:
18- psr .add_argument ('--hdf5' , "-I" , required = False , help = 'input hdf5 filestem' ,
19- default = None , action = 'store' )
17+ # files:
18+ psr .add_argument ('--out' , "-O" , required = True , help = 'output file' )
2019
21- psr .add_argument ('--gdisthdf5 ' , required = False , default = None , type = str ,
22- help = 'key for genetic position in variants table of hdf5 ' )
20+ psr .add_argument ('--format ' , "-F" , required = False , default = "vcf" ,
21+ help = 'input expected. One of "vcf" (default), "hdf5", or "txt" ' )
2322
24- psr . add_argument ( '--samplesA' , '-Sa' , action = 'store' , default = None ,
25- dest = 'samplesA ' , required = False ,
26- help = 'Which samples comprise population A. Comma separated ' )
23+ # data inputs for hdf5/VCF format:
24+ psr . add_argument ( '--input' , '-I ' , required = False , help = 'input file vcf or hdf5' ,
25+ default = None , action = 'store ' )
2726
28- psr .add_argument ('--samplesB' , '-Sb' , action = 'store' , default = None ,
29- dest = 'samplesB' , required = False ,
30- help = 'Which samples comprise population B. Comma separated' )
27+ psr .add_argument ('--gdistkey' , required = False , default = None , type = str ,
28+ help = 'key for genetic position in variants table of hdf5/VCF' )
3129
32- psr .add_argument ('--rrate' , '-R' , dest = 'rrate' , action = 'store' ,
33- default = 1e-8 , help = 'recombination rate per base' )
30+ psr .add_argument ('--samplesA' , '-Sa' , action = 'store' , default = None ,
31+ dest = 'samplesA' , required = False ,
32+ help = 'Samples comprising population A. Comma separated list or path to file with each ID on a line' )
3433
34+ psr .add_argument ('--samplesB' , '-Sb' , action = 'store' , default = None ,
35+ dest = 'samplesB' , required = False ,
36+ help = 'Samples comprising population B. Comma separated list or path to file with each ID on a line' )
3537
36- # data inputs for text format
37- psr .add_argument ('--map ' , required = False , default = None , action = 'store' ,
38- help = 'input map file as per XPCLR specs ' )
38+ # only used when input is hdf5 or VCF
39+ psr .add_argument ('--rrate ' , '-R' , dest = 'rrate' , action = 'store' ,
40+ default = 1e-8 , help = 'recombination rate per base ' )
3941
40- psr .add_argument ('--popA' , required = False , default = None , action = 'store' ,
41- help = 'filepath to population A genotypes' )
42+ # data inputs for text format
43+ psr .add_argument ('--map' , required = False , default = None , action = 'store' ,
44+ help = 'input map file as per XPCLR specs' )
4245
43- psr .add_argument ('--popB ' , required = False , default = None , action = 'store' ,
44- help = 'filepath to population A genotypes' )
46+ psr .add_argument ('--popA ' , required = False , default = None , action = 'store' ,
47+ help = 'filepath to population A genotypes' )
4548
46- # chrom
47- psr .add_argument ('--chr' , '-C' , required = True , dest = 'chrom' , action = 'store' ,
48- help = 'Which contig analysis is based on' )
49+ psr .add_argument ('--popB' , required = False , default = None , action = 'store' ,
50+ help = 'filepath to population A genotypes' )
4951
50- # parameters
51- psr .add_argument ('--ld' , '-L' , dest = 'ldcutoff' , action = 'store' ,
52- default = 0.95 , help = 'LD cutoff to apply for weighting' )
52+ # parameters
53+ # chrom
54+ psr .add_argument ('--chr' , '-C' , required = True , dest = 'chrom' , action = 'store' ,
55+ help = 'Which contig analysis is based on' )
5356
54- psr .add_argument ('--phased ' , '-P ' , dest = 'phased ' , action = 'store_true ' ,
55- help = 'whether data is phased for more precise r2 calculation ' )
57+ psr .add_argument ('--ld ' , '-L ' , dest = 'ldcutoff ' , action = 'store ' ,
58+ default = 0.95 , help = 'LD cutoff to apply for weighting ' )
5659
57- psr .add_argument ('--verbose ' , '-V ' , dest = 'verbose ' , action = 'store_true' ,
58- help = 'whether to be verbose ' )
60+ psr .add_argument ('--phased ' , '-P ' , dest = 'phased ' , action = 'store_true' ,
61+ help = 'whether data is phased for more precise r2 calculation ' )
5962
60- psr .add_argument ('--maxsnps' , '-M' , dest = 'maxsnps' , action = 'store' ,
61- default = 200 , help = 'max SNPs in a window' )
62- psr .add_argument ('--minsnps' , '-N' , dest = 'minsnps' , action = 'store' ,
63- default = 10 , help = 'min SNPs in a window' )
63+ psr .add_argument ('--verbose' , '-V' , dest = 'verbose' , action = 'store' , default = 20 , type = int ,
64+ help = 'How verbose to be in logging. 10=DEBUG, 20=INFO, 30=WARN, 40=ERROR, 50=CRITICAL' )
6465
65- psr .add_argument ('--size' , dest = 'size' , action = 'store' ,
66- default = 20000 , help = 'window size' , type = int )
67- psr .add_argument ('--start' , dest = 'start' , action = 'store' , type = int ,
68- default = 1 , help = 'start base position for windows' )
69- psr .add_argument ('--stop' , dest = 'stop' , action = 'store' , default = None , type = int ,
70- help = 'stop base position for windows' )
71- psr .add_argument ('--step' , dest = 'step' , action = 'store' , default = 20000 , type = int ,
72- help = 'windows step for slide' )
66+ psr .add_argument ('--maxsnps' , '-M' , dest = 'maxsnps' , action = 'store' ,
67+ default = 200 , help = 'max SNPs in a window' )
68+ psr .add_argument ('--minsnps' , '-N' , dest = 'minsnps' , action = 'store' ,
69+ default = 10 , help = 'min SNPs in a window' )
70+
71+ psr .add_argument ('--size' , dest = 'size' , action = 'store' ,
72+ default = 20000 , help = 'window size in base pairs' , type = int )
73+ psr .add_argument ('--start' , dest = 'start' , action = 'store' , type = int ,
74+ default = 1 , help = 'start base position for windows' )
75+ psr .add_argument ('--stop' , dest = 'stop' , action = 'store' , default = None , type = int ,
76+ help = 'stop base position for windows' )
77+ psr .add_argument ('--step' , dest = 'step' , action = 'store' , default = 20000 , type = int ,
78+ help = 'step size for sliding windows' )
79+
80+ args = psr .parse_args ()
7381
74- args = psr . parse_args ( )
82+ logging . basicConfig ( format = LOGFORMAT , level = args . verbose , datefmt = "%Y-%m-%d %H:%M:%S" )
7583
84+ fn = args .out
85+ outdir = os .path .dirname (fn )
7686
77- fn = args . out
78- outdir = os . path . dirname ( fn )
87+ assert os . access ( outdir , os . W_OK ), \
88+ "No permission to write in the specified directory: {0}" . format ( outdir )
7989
80- assert os . access ( outdir , os . W_OK ), \
81- "No permission to write in the specified directory: {0}" . format ( outdir )
90+ logging . info ( "running xpclr v{0}" . format ( xpclr . __version__ ))
91+ chromosome = args . chrom
8292
83- print ( "xpclr v{0}" . format ( xpclr . __version__ ))
84- chromosome = args .chrom
93+ # if mode is "hdf5"
94+ if args .format == 'vcf' :
8595
86- # if mode is "hdf5"
87- if args .hdf5 is not None :
96+ logging .info ("Loading VCF" )
97+ g1 , g2 , positions , genetic_dist = xpclr .util .load_vcf_format_data (
98+ args .input .strip (),
99+ chromosome ,
100+ args .samplesA ,
101+ args .samplesB ,
102+ gdistkey = args .gdistkey )
103+ logging .info ("VCF loading complete" )
88104
89- samples1 = [sample_id .strip () for sample_id in args .samplesA .split ("," )]
90- samples2 = [sample_id .strip () for sample_id in args .samplesB .split ("," )]
105+ elif args .format == 'hdf5' :
91106
92- g1 , g2 , positions , genetic_dist = util .load_hdf5_data (
93- args .hdf5 .strip (),
94- chromosome ,
95- samples1 ,
96- samples2 ,
97- gdistkey = args .gdisthdf5 )
98-
99- else :
100- # else if mode is text
101- g1 , g2 , positions , genetic_dist = util .load_text_format_data (
102- args .map ,
103- args .popA ,
104- args .popB )
105-
106-
107- ac1 = g1 .count_alleles ()
108- ac2 = g2 .count_alleles ()
109-
110- print ("TOTAL: {0} SNPs are in the provided input files" .format (g1 .shape [0 ]))
111- multiallelic = (ac1 [:, 2 :].sum (1 ) > 0 ) | (ac2 [:, 2 :].sum (1 ) > 0 )
112- print ("EXCLUDING: {0} SNPs as multiallelic " .format (multiallelic .sum ()))
113-
114- # all missing in either
115- missing = (np .array (ac1 .sum (1 )) == 0 ) | (np .array (ac2 .sum (1 )) == 0 )
116- print ("EXCLUDING: {0} SNPs as missing in all samples in a population"
117- .format (np .sum (missing & ~ multiallelic )))
118-
119- # drop if fixed in AC2,
120- fixed_p2 = ac2 .is_non_segregating ()[:] | \
121- ac2 .is_singleton (0 )[:] | \
122- ac2 .is_singleton (1 )[:]
123- print ("EXCLUDING: {0} SNPs as invariant or singleton in population 2"
124- .format (np .sum (fixed_p2 & ~ missing & ~ multiallelic )))
125-
126- # now compress all!
127- include = (~ multiallelic & ~ fixed_p2 & ~ missing )
128- print ("TOTAL: {0} SNPs included in the analysis" .format (include .sum ()))
129-
130- g1 = g1 .compress (include , axis = 0 )
131- g2 = g2 .compress (include , axis = 0 )
132- positions = np .compress (include , positions , axis = 0 )
133- if genetic_dist is not None :
134- genetic_dist = np .compress (include , genetic_dist , axis = 0 )
135- print ("..done dropping above SNPs from analysis." )
136-
137- # check everything is as expected.
138- assert g1 .shape [0 ] == g2 .shape [0 ] == positions .shape [0 ]
139-
140- # determine windows
141- if args .stop is None :
142- args .stop = positions [- 1 ]
143- spacing = np .arange (args .start , args .stop , args .step )
144- scan_windows = np .vstack ([spacing , spacing - 1 + args .size ]).T
145-
146- # main function
147- modelL , nullL , selcoef , nsnps , navail , snpedges = \
148- xpclr .xpclr_scan (g1 , g2 , positions , scan_windows , geneticd = genetic_dist ,
149- ldcutoff = args .ldcutoff , phased = args .phased ,
150- maxsnps = args .maxsnps , minsnps = args .minsnps ,
151- rrate = args .rrate , verbose = args .verbose )
152-
153- df = util .tabulate_results (chromosome , modelL , nullL , selcoef ,
154- nsnps , navail , scan_windows , snpedges )
155-
156- df .to_csv (fn , sep = "\t " , index = False )
107+ logging .info ("Loading HDF5" )
108+ g1 , g2 , positions , genetic_dist = xpclr .util .load_hdf5_data (
109+ args .input .strip (),
110+ chromosome ,
111+ args .samplesA ,
112+ args .samplesB ,
113+ gdistkey = args .gdistkey )
114+ logging .info ("HDF5 loading complete" )
115+
116+ elif args .format == 'txt' :
117+ # else if mode is text
118+ logging .info ("Loading TXT" )
119+ g1 , g2 , positions , genetic_dist = xpclr .util .load_text_format_data (
120+ args .map ,
121+ args .popA ,
122+ args .popB )
123+ logging .info ("TXT loading complete" )
124+ else :
125+ raise ValueError ("Format must be one of vcf/hdf5/txt" )
126+
127+ ac1 = g1 .count_alleles ()
128+ ac2 = g2 .count_alleles ()
129+
130+ logging .info ("{0:,} SNPs in total are in the provided input files" .format (g1 .shape [0 ]))
131+
132+ multiallelic = (ac1 [:, 2 :].sum (1 ) > 0 ) | (ac2 [:, 2 :].sum (1 ) > 0 )
133+ logging .info ("{0:,} SNPs excluded as multiallelic" .format (multiallelic .sum ()))
134+
135+ # all missing in either
136+ missing = (np .array (ac1 .sum (1 )) == 0 ) | (np .array (ac2 .sum (1 )) == 0 )
137+ logging .info (
138+ "{0:,} SNPs excluded as missing in all samples in a population" .format (np .sum (missing & ~ multiallelic )))
139+
140+ # drop if fixed in AC2,
141+ fixed_p2 = ac2 .is_non_segregating ()[:] | ac2 .is_singleton (0 )[:] | ac2 .is_singleton (1 )[:]
142+
143+ logging .info (
144+ "{0:,} SNPs excluded as invariant or singleton in population 2" .format (
145+ np .sum (fixed_p2 & ~ missing & ~ multiallelic )))
146+
147+ # now compress all!
148+ include = (~ multiallelic & ~ fixed_p2 & ~ missing )
149+ logging .info (
150+ "{0:,}/{1:,} SNPs included in the analysis ({2:.2f}%)" .format (
151+ include .sum (), g1 .shape [0 ], 100 * include .sum ()/ g1 .shape [0 ]))
152+
153+ g1 = g1 .compress (include , axis = 0 )
154+ g2 = g2 .compress (include , axis = 0 )
155+ positions = np .compress (include , positions , axis = 0 )
156+ if genetic_dist is not None :
157+ genetic_dist = np .compress (include , genetic_dist , axis = 0 )
158+ logging .info ("Done dropping above SNPs from analysis. XP-CLR algorithm starting." )
159+
160+ # check everything is as expected.
161+ assert g1 .shape [0 ] == g2 .shape [0 ] == positions .shape [0 ]
162+
163+ # determine windows
164+ if args .stop is None :
165+ args .stop = positions [- 1 ]
166+ spacing = np .arange (args .start , args .stop , args .step )
167+ scan_windows = np .vstack ([spacing , spacing - 1 + args .size ]).T
168+
169+ # main function
170+ model_lik , null_lik , selcoef , nsnps , navail , snpedges = xpclr .methods .xpclr_scan (
171+ g1 , g2 , positions , scan_windows , geneticd = genetic_dist ,
172+ ldcutoff = args .ldcutoff , phased = args .phased ,
173+ maxsnps = args .maxsnps , minsnps = args .minsnps ,
174+ rrate = args .rrate )
175+
176+ df = xpclr .util .tabulate_results (
177+ chromosome , model_lik , null_lik , selcoef , nsnps , navail , scan_windows , snpedges )
178+
179+ df .to_csv (fn , sep = "\t " , index = False )
180+ logging .info ("Analysis complete. Output file {0}" .format (fn ))
181+
182+
183+ if __name__ == '__main__' :
184+ main ()
0 commit comments