1616from . import summarize
1717from . import vdj_gather
1818from . import vdj_match
19- from . import tab2seq
19+ from . import convert
2020from . import show
2121from .util import IgSeqError
2222from .version import __version__
@@ -38,6 +38,22 @@ def wrap(txt):
3838 chunks = txt .strip ().split ("\n \n " )
3939 return "\n \n " .join ([wrap (chunk ) for chunk in chunks ])
4040
def args_to_colmap(args):
    """Make dictionary of column name mappings from cmd-line arguments.

    This is used for commands that work with tabular inputs/outputs.

    Every attribute of ``args`` whose name begins with ``col_`` and whose
    value is not None contributes one entry.  The ``col_`` prefix is dropped
    and the abbreviated words ``desc``, ``seq`` and ``qual`` are expanded to
    ``description``, ``sequence`` and ``quality``; the resulting name is the
    key and the attribute's value is stored unchanged.  For example an
    attribute ``col_seq_id="ID"`` yields ``{"sequence_id": "ID"}``.

    Attributes that are None are left out of the returned dictionary
    entirely, and attributes without the ``col_`` prefix are ignored.
    """
    prefix = "col_"
    longer = {"desc": "description", "seq": "sequence", "qual": "quality"}
    colmap = {}
    for key, val in vars(args).items():
        # Require the whole "col_" prefix, not just "col": otherwise an
        # unrelated attribute such as "color" or "columns" would be treated
        # as a column option and produce a bogus (possibly empty) key.
        if not key.startswith(prefix) or val is None:
            continue
        # convert arguments like "col_seq_id" to "sequence_id"
        words = key[len(prefix):].split("_")
        colmap["_".join(longer.get(word, word) for word in words)] = val
    return colmap
56+
4157def main (arglist = None ):
4258 """Command-line interface.
4359
@@ -138,22 +154,28 @@ def _main_list(args):
138154 show .list_files (text_items = args .text )
139155
def _main_igblast(args, extra_igblastn_args=None):
    """Dispatch the parsed ``igblast`` sub-command to ``igblast.igblast``.

    ``args`` is the parsed argument namespace; ``extra_igblastn_args`` is
    forwarded untouched as ``extra_args``.  Column-name options are gathered
    into a single mapping with ``args_to_colmap`` before the call.
    """
    column_names = args_to_colmap(args)
    # Keyword names are those expected by igblast.igblast; values come
    # straight from the parsed command line.
    options = {
        "query_path": args.query,
        "ref_paths": args.reference,
        "db_path": args.database,
        "species": args.species,
        "fmt_in": args.input_format,
        "colmap": column_names,
        "extra_args": extra_igblastn_args,
        "dry_run": args.dry_run,
        "threads": args.threads,
    }
    igblast.igblast(**options)
149168
def _main_summarize(args):
    """Dispatch the parsed ``summarize`` sub-command to ``summarize.summarize``.

    Column-name options on ``args`` are collected with ``args_to_colmap`` and
    passed along as ``colmap``; every other value is forwarded as parsed.
    """
    column_names = args_to_colmap(args)
    options = {
        "ref_paths": args.reference,
        "query": args.query,
        "output": args.output,
        "showtxt": args.show,
        "species": args.species,
        "fmt_in": args.input_format,
        "colmap": column_names,
        "dry_run": args.dry_run,
    }
    summarize.summarize(**options)
158180
159181def _main_vdj_gather (args ):
@@ -163,24 +185,27 @@ def _main_vdj_gather(args):
163185 dry_run = args .dry_run )
164186
def _main_vdj_match(args):
    """Dispatch the parsed ``vdj-match`` sub-command to ``vdj_match.vdj_match``.

    Column-name options on ``args`` are collected with ``args_to_colmap`` and
    passed along as ``colmap``; every other value is forwarded as parsed.
    """
    column_names = args_to_colmap(args)
    options = {
        "ref_paths": args.reference,
        "query": args.query,
        "output": args.output,
        "showtxt": args.show,
        "species": args.species,
        "fmt_in": args.input_format,
        "colmap": column_names,
        "dry_run": args.dry_run,
    }
    vdj_match.vdj_match(**options)
173198
174- def _main_tab2seq (args ):
175- tab2seq . tab2seq (
176- tab_path_in = args . input ,
177- seq_path_out = args .output ,
178- seq_col = args .seq_col ,
179- seq_id_col = args .seq_id_col ,
180- seq_desc_col = args .seq_desc_col ,
181- qual_col = args . seq_qual_col ,
182- tab_fmt = args .tab_fmt ,
183- seq_fmt = args .seq_fmt )
def _main_convert(args):
    """Dispatch the parsed ``convert`` sub-command to ``convert.convert``.

    Input/output paths and formats, the dummy quality value and the dry-run
    flag are forwarded as parsed; column-name options are collected with
    ``args_to_colmap`` and passed as ``colmap``.
    """
    column_names = args_to_colmap(args)
    options = {
        "path_in": args.input,
        "path_out": args.output,
        "fmt_in": args.input_format,
        "fmt_out": args.output_format,
        "colmap": column_names,
        "dummyqual": args.dummy_qual,
        "dry_run": args.dry_run,
    }
    convert.convert(**options)
184209
185210def _setup_log (verbose , quiet , prefix ):
186211 # Handle warnings via logging
@@ -240,9 +265,9 @@ def __setup_arg_parser():
240265 help = "Find closest-matching germline VDJ sequences" ,
241266 description = rewrap (vdj_match .__doc__ ),
242267 formatter_class = argparse .RawDescriptionHelpFormatter )
243- p_tab2seq = subps .add_parser ("tab2seq " ,
244- help = "Convert CSV/TSV to FASTA/FASTQ" ,
245- description = rewrap (tab2seq .__doc__ ),
268+ p_convert = subps .add_parser ("convert " ,
269+ help = "Convert FASTA/FASTQ/CSV/TSV " ,
270+ description = rewrap (convert .__doc__ ),
246271 formatter_class = argparse .RawDescriptionHelpFormatter )
247272 p_show = subps .add_parser ("show" ,
248273 help = "show file contents" ,
@@ -347,13 +372,20 @@ def __setup_arg_parser():
347372
348373 __add_common_args (p_igblast )
349374 p_igblast .add_argument ("-Q" , "--query" , required = True ,
350- help = "query FASTA " )
375+ help = "query input " )
351376 p_igblast .add_argument ("-r" , "--reference" , nargs = "+" ,
352377 help = "one or more FASTA/directory/builtin names pointing to V/D/J FASTA files" )
353378 p_igblast .add_argument ("-d" , "--database" ,
354379 help = "optional persistent database directory name (default: use temp directory)" )
355380 p_igblast .add_argument ("-S" , "--species" ,
356381 help = "species to use (human or rhesus). Default: infer from database if possible" )
382+ p_igblast .add_argument ("--input-format" ,
383+ help = "format of query input "
384+ "(default: detect from input filename if possible)" )
385+ p_igblast .add_argument ("--col-seq-id" ,
386+ help = "Name of column containing sequence IDs (for tabular query input)" )
387+ p_igblast .add_argument ("--col-seq" ,
388+ help = "Name of column containing sequences (for tabular query input)" )
357389 p_igblast .add_argument ("-t" , "--threads" , type = int , default = 1 ,
358390 help = "number of threads for parallel processing (default: 1)" )
359391 p_igblast .set_defaults (func = _main_igblast )
@@ -365,6 +397,13 @@ def __setup_arg_parser():
365397 help = "query FASTA" )
366398 p_summarize .add_argument ("-S" , "--species" ,
367399 help = "species to use (human or rhesus). Default: infer from database if possible" )
400+ p_summarize .add_argument ("--input-format" ,
401+ help = "format of query input "
402+ "(default: detect from input filename if possible)" )
403+ p_summarize .add_argument ("--col-seq-id" ,
404+ help = "Name of column containing sequence IDs (for tabular query input)" )
405+ p_summarize .add_argument ("--col-seq" ,
406+ help = "Name of column containing sequences (for tabular query input)" )
368407 p_summarize .add_argument ("-o" , "--output" ,
369408 help = "Output filename" )
370409 p_summarize .add_argument ("--show" , action = argparse .BooleanOptionalAction ,
@@ -386,34 +425,43 @@ def __setup_arg_parser():
386425 help = "query FASTA" )
387426 p_vdj_match .add_argument ("-S" , "--species" ,
388427 help = "species to use (human or rhesus). Default: infer from database if possible" )
428+ p_vdj_match .add_argument ("--input-format" ,
429+ help = "format of query input "
430+ "(default: detect from input filename if possible)" )
431+ p_vdj_match .add_argument ("--col-seq-id" ,
432+ help = "Name of column containing sequence IDs (for tabular query input)" )
433+ p_vdj_match .add_argument ("--col-seq" ,
434+ help = "Name of column containing sequences (for tabular query input)" )
389435 p_vdj_match .add_argument ("-o" , "--output" ,
390436 help = "Output filename" )
391437 p_vdj_match .add_argument ("--show" , action = argparse .BooleanOptionalAction ,
392438 help = "Explicitly enable/disable showing the results directly on standard output "
393439 "(default: disabled if using file output, enabled otherwise)" )
394440 p_vdj_match .set_defaults (func = _main_vdj_match )
395441
396- __add_common_args (p_tab2seq )
397- p_tab2seq .add_argument ("input" ,
398- help = "one CSV or TSV file path, or a literal '-' for standard input" )
399- p_tab2seq .add_argument ("output" ,
400- help = "one FASTA or FASTQ file path, or a literal '-' for standard output" )
401- p_tab2seq .add_argument ("--seq-col" , required = True ,
402- help = "name of table column containing sequences" )
403- p_tab2seq .add_argument ("--seq-id-col" , required = True ,
404- help = "name of table column containing sequence IDs" )
405- p_tab2seq .add_argument ("--seq-desc-col" ,
406- help = "name of table column containing sequence descriptions (optional)" )
407- p_tab2seq .add_argument ("--seq-qual-col" ,
408- help = "name of table column containing sequence quality "
409- "scores as PHRED+33 (for FASTQ output only)" )
410- p_tab2seq .add_argument ("--tab-fmt" ,
411- help = "Format of input: tsv or csv. "
412- "default is detected from input filename if possible" )
413- p_tab2seq .add_argument ("--seq-fmt" ,
414- help = "Format of output: fasta or fastq. "
415- "default is detected from output filename if possible" )
416- p_tab2seq .set_defaults (func = _main_tab2seq )
442+ __add_common_args (p_convert )
443+ p_convert .add_argument ("input" ,
444+ help = "input file path, or a literal '-' for standard input" )
445+ p_convert .add_argument ("output" ,
446+ help = "output file path, or a literal '-' for standard output" )
447+ p_convert .add_argument ("--input-format" ,
448+ help = "format of input "
449+ "(default: detect from input filename if possible)" )
450+ p_convert .add_argument ("--output-format" ,
451+ help = "format of output "
452+ "(default: detect from output filename if possible)" )
453+ p_convert .add_argument ("--col-seq-id" ,
454+ help = "Name of column containing sequence IDs (for tabular input/output)" )
455+ p_convert .add_argument ("--col-seq" ,
456+ help = "Name of column containing sequences (for tabular input/output)" )
457+ p_convert .add_argument ("--col-seq-qual" ,
458+ help = "Name of column containing sequence qualities (for tabular input/output)" )
459+ p_convert .add_argument ("--col-seq-desc" ,
460+ help = "Name of column containing sequence descriptions (for tabular input/output)" )
461+ p_convert .add_argument ("-d" , "--dummy-qual" ,
462+ help = "Quality score to use for all bases for applicable output types, "
463+ 'as text (e.g. use "I" for 40)' )
464+ p_convert .set_defaults (func = _main_convert )
417465
418466 return parser
419467
0 commit comments