Skip to content

Commit ea96c91

Browse files
authored
Merge pull request #22 from ShawHahnLab/release-0.2.0
Version 0.2.0
2 parents 3e121fe + 41972c4 commit ea96c91

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

63 files changed

+3138
-260
lines changed

CHANGELOG.md

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,24 @@
11
# Changelog
22

3+
## 0.2.0 - 2022-02-15
4+
5+
### Added
6+
7+
* Human germline FASTAs from IMGT ([#21])
8+
* support for FASTA/FASTQ/CSV/TSV query inputs for the igblast and related
9+
commands ([#18], [#19])
10+
* convert command for FASTA/FASTQ/CSV/TSV file conversion, in place of the
11+
more limited tab2seq command ([#14], [#16])
12+
* Rhesus germline HV and HJ allele FASTAs from
13+
[10.4049/jimmunol.1800342](https://doi.org/10.4049/jimmunol.1800342) ([#13])
14+
15+
[#21]: https://github.com/ShawHahnLab/igseq/pull/21
16+
[#19]: https://github.com/ShawHahnLab/igseq/pull/19
17+
[#18]: https://github.com/ShawHahnLab/igseq/pull/18
18+
[#16]: https://github.com/ShawHahnLab/igseq/pull/16
19+
[#14]: https://github.com/ShawHahnLab/igseq/pull/14
20+
[#13]: https://github.com/ShawHahnLab/igseq/pull/13
21+
322
## 0.1.1 - 2021-12-07
423

524
### Changed

conda/meta.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
# https://docs.conda.io/projects/conda-build/en/latest/resources/define-metadata.html
2-
{% set version = "0.1.1" %}
2+
{% set version = "0.2.0" %}
33
{% set build = "0" %}
44

55
package:

igseq/__main__.py

Lines changed: 84 additions & 36 deletions
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@
1616
from . import summarize
1717
from . import vdj_gather
1818
from . import vdj_match
19-
from . import tab2seq
19+
from . import convert
2020
from . import show
2121
from .util import IgSeqError
2222
from .version import __version__
@@ -38,6 +38,22 @@ def wrap(txt):
3838
chunks = txt.strip().split("\n\n")
3939
return "\n\n".join([wrap(chunk) for chunk in chunks])
4040

41+
def args_to_colmap(args):
    """Make dictionary of column name mappings from cmd-line arguments.

    This is used for commands that work with tabular inputs/outputs.

    Every attribute of args whose name starts with "col_" and whose value is
    not None becomes one entry in the result.  The entry's key is the
    attribute name with the "col_" prefix dropped and the abbreviated words
    "desc", "seq", and "qual" expanded (so "col_seq_id" becomes
    "sequence_id"); the entry's value is the attribute's value.

    args: an object supporting vars(), such as an argparse.Namespace.
    Returns a dictionary, empty if no matching attributes are set.
    """
    # abbreviated words used in the argument names and their long forms
    longer = {"desc": "description", "seq": "sequence", "qual": "quality"}
    prefix = "col_"
    colmap = {}
    for key, val in vars(args).items():
        # Require the whole "col_" prefix (underscore included) so that
        # unrelated arguments that merely begin with "col", like "color" or
        # "columns", don't end up in the mapping under an empty key.
        if not key.startswith(prefix) or val is None:
            continue
        words = key[len(prefix):].split("_")
        colmap["_".join(longer.get(word, word) for word in words)] = val
    return colmap
56+
4157
def main(arglist=None):
4258
"""Command-line interface.
4359
@@ -138,22 +154,28 @@ def _main_list(args):
138154
show.list_files(text_items=args.text)
139155

140156
def _main_igblast(args, extra_igblastn_args=None):
157+
colmap = args_to_colmap(args)
141158
igblast.igblast(
142159
query_path=args.query,
143160
ref_paths=args.reference,
144161
db_path=args.database,
145162
species=args.species,
163+
fmt_in=args.input_format,
164+
colmap=colmap,
146165
extra_args=extra_igblastn_args,
147166
dry_run=args.dry_run,
148167
threads=args.threads)
149168

150169
def _main_summarize(args):
170+
colmap = args_to_colmap(args)
151171
summarize.summarize(
152172
ref_paths=args.reference,
153173
query=args.query,
154174
output=args.output,
155175
showtxt=args.show,
156176
species=args.species,
177+
fmt_in=args.input_format,
178+
colmap=colmap,
157179
dry_run=args.dry_run)
158180

159181
def _main_vdj_gather(args):
@@ -163,24 +185,27 @@ def _main_vdj_gather(args):
163185
dry_run=args.dry_run)
164186

165187
def _main_vdj_match(args):
188+
colmap = args_to_colmap(args)
166189
vdj_match.vdj_match(
167190
ref_paths=args.reference,
168191
query=args.query,
169192
output=args.output,
170193
showtxt=args.show,
171194
species=args.species,
195+
fmt_in=args.input_format,
196+
colmap=colmap,
172197
dry_run=args.dry_run)
173198

174-
def _main_tab2seq(args):
175-
tab2seq.tab2seq(
176-
tab_path_in=args.input,
177-
seq_path_out=args.output,
178-
seq_col=args.seq_col,
179-
seq_id_col=args.seq_id_col,
180-
seq_desc_col=args.seq_desc_col,
181-
qual_col=args.seq_qual_col,
182-
tab_fmt=args.tab_fmt,
183-
seq_fmt=args.seq_fmt)
199+
def _main_convert(args):
200+
colmap = args_to_colmap(args)
201+
convert.convert(
202+
path_in=args.input,
203+
path_out=args.output,
204+
fmt_in=args.input_format,
205+
fmt_out=args.output_format,
206+
colmap=colmap,
207+
dummyqual=args.dummy_qual,
208+
dry_run=args.dry_run)
184209

185210
def _setup_log(verbose, quiet, prefix):
186211
# Handle warnings via logging
@@ -240,9 +265,9 @@ def __setup_arg_parser():
240265
help="Find closest-matching germline VDJ sequences",
241266
description=rewrap(vdj_match.__doc__),
242267
formatter_class=argparse.RawDescriptionHelpFormatter)
243-
p_tab2seq = subps.add_parser("tab2seq",
244-
help="Convert CSV/TSV to FASTA/FASTQ",
245-
description=rewrap(tab2seq.__doc__),
268+
p_convert = subps.add_parser("convert",
269+
help="Convert FASTA/FASTQ/CSV/TSV",
270+
description=rewrap(convert.__doc__),
246271
formatter_class=argparse.RawDescriptionHelpFormatter)
247272
p_show = subps.add_parser("show",
248273
help="show file contents",
@@ -347,13 +372,20 @@ def __setup_arg_parser():
347372

348373
__add_common_args(p_igblast)
349374
p_igblast.add_argument("-Q", "--query", required=True,
350-
help="query FASTA")
375+
help="query input")
351376
p_igblast.add_argument("-r", "--reference", nargs="+",
352377
help="one or more FASTA/directory/builtin names pointing to V/D/J FASTA files")
353378
p_igblast.add_argument("-d", "--database",
354379
help="optional persistent database directory name (default: use temp directory)")
355380
p_igblast.add_argument("-S", "--species",
356381
help="species to use (human or rhesus). Default: infer from database if possible")
382+
p_igblast.add_argument("--input-format",
383+
help="format of query input "
384+
"(default: detect from input filename if possible)")
385+
p_igblast.add_argument("--col-seq-id",
386+
help="Name of column containing sequence IDs (for tabular query input)")
387+
p_igblast.add_argument("--col-seq",
388+
help="Name of column containing sequences (for tabular query input)")
357389
p_igblast.add_argument("-t", "--threads", type=int, default=1,
358390
help="number of threads for parallel processing (default: 1)")
359391
p_igblast.set_defaults(func=_main_igblast)
@@ -365,6 +397,13 @@ def __setup_arg_parser():
365397
help="query FASTA")
366398
p_summarize.add_argument("-S", "--species",
367399
help="species to use (human or rhesus). Default: infer from database if possible")
400+
p_summarize.add_argument("--input-format",
401+
help="format of query input "
402+
"(default: detect from input filename if possible)")
403+
p_summarize.add_argument("--col-seq-id",
404+
help="Name of column containing sequence IDs (for tabular query input)")
405+
p_summarize.add_argument("--col-seq",
406+
help="Name of column containing sequences (for tabular query input)")
368407
p_summarize.add_argument("-o", "--output",
369408
help="Output filename")
370409
p_summarize.add_argument("--show", action=argparse.BooleanOptionalAction,
@@ -386,34 +425,43 @@ def __setup_arg_parser():
386425
help="query FASTA")
387426
p_vdj_match.add_argument("-S", "--species",
388427
help="species to use (human or rhesus). Default: infer from database if possible")
428+
p_vdj_match.add_argument("--input-format",
429+
help="format of query input "
430+
"(default: detect from input filename if possible)")
431+
p_vdj_match.add_argument("--col-seq-id",
432+
help="Name of column containing sequence IDs (for tabular query input)")
433+
p_vdj_match.add_argument("--col-seq",
434+
help="Name of column containing sequences (for tabular query input)")
389435
p_vdj_match.add_argument("-o", "--output",
390436
help="Output filename")
391437
p_vdj_match.add_argument("--show", action=argparse.BooleanOptionalAction,
392438
help="Explicitly enable/disable showing the results directly on standard output "
393439
"(default: disabled if using file output, enabled otherwise)")
394440
p_vdj_match.set_defaults(func=_main_vdj_match)
395441

396-
__add_common_args(p_tab2seq)
397-
p_tab2seq.add_argument("input",
398-
help="one CSV or TSV file path, or a literal '-' for standard input")
399-
p_tab2seq.add_argument("output",
400-
help="one FASTA or FASTQ file path, or a literal '-' for standard output")
401-
p_tab2seq.add_argument("--seq-col", required=True,
402-
help="name of table column containing sequences")
403-
p_tab2seq.add_argument("--seq-id-col", required=True,
404-
help="name of table column containing sequence IDs")
405-
p_tab2seq.add_argument("--seq-desc-col",
406-
help="name of table column containing sequence descriptions (optional)")
407-
p_tab2seq.add_argument("--seq-qual-col",
408-
help="name of table column containing sequence quality "
409-
"scores as PHRED+33 (for FASTQ output only)")
410-
p_tab2seq.add_argument("--tab-fmt",
411-
help="Format of input: tsv or csv. "
412-
"default is detected from input filename if possible")
413-
p_tab2seq.add_argument("--seq-fmt",
414-
help="Format of output: fasta or fastq. "
415-
"default is detected from output filename if possible")
416-
p_tab2seq.set_defaults(func=_main_tab2seq)
442+
__add_common_args(p_convert)
443+
p_convert.add_argument("input",
444+
help="input file path, or a literal '-' for standard input")
445+
p_convert.add_argument("output",
446+
help="output file path, or a literal '-' for standard output")
447+
p_convert.add_argument("--input-format",
448+
help="format of input "
449+
"(default: detect from input filename if possible)")
450+
p_convert.add_argument("--output-format",
451+
help="format of output "
452+
"(default: detect from output filename if possible)")
453+
p_convert.add_argument("--col-seq-id",
454+
help="Name of column containing sequence IDs (for tabular input/output)")
455+
p_convert.add_argument("--col-seq",
456+
help="Name of column containing sequences (for tabular input/output)")
457+
p_convert.add_argument("--col-seq-qual",
458+
help="Name of column containing sequence qualities (for tabular input/output)")
459+
p_convert.add_argument("--col-seq-desc",
460+
help="Name of column containing sequence descriptions (for tabular input/output)")
461+
p_convert.add_argument("-d", "--dummy-qual",
462+
help="Quality score to use for all bases for applicable output types, "
463+
'as text (e.g. use "I" for 40)')
464+
p_convert.set_defaults(func=_main_convert)
417465

418466
return parser
419467

igseq/convert.py

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,23 @@
1+
"""
2+
Convert between various sequence and tabular file formats.
3+
4+
Input and output formats are by default inferred from filenames but can be
5+
given explicitly if needed. The formats are:
6+
7+
fa: FASTA
8+
fagz: gzipped FASTA
9+
fq: FASTQ
10+
fqgz: gzipped FASTQ
11+
csv: comma-separated values
12+
csvgz: gzipped comma-separated values
13+
tsv: tab-separated values
14+
tsvgz: gzipped tab-separated values
15+
"""
16+
17+
from .record import RecordReader, RecordWriter
18+
19+
def convert(path_in, path_out, fmt_in=None, fmt_out=None, colmap=None, dummyqual=None, dry_run=False):
    """Copy all records from one file to another, converting the format.

    A RecordReader is opened on path_in (with fmt_in and colmap) and a
    RecordWriter on path_out (with fmt_out, colmap, and dummyqual), and each
    record produced by the reader is handed to the writer in order.  The
    dry_run flag is passed along to both the reader and the writer.
    """
    with RecordReader(path_in, fmt_in, colmap, dry_run=dry_run) as source:
        with RecordWriter(
                path_out, fmt_out, colmap,
                dummyqual=dummyqual, dry_run=dry_run) as sink:
            for entry in source:
                sink.write(entry)

igseq/data/examples/convert.sh

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,22 @@
1+
#!/usr/bin/env bash
2+
3+
[ -v EXAMPLES ] || EXAMPLES=$(python -c 'import igseq.util; print(igseq.util.DATA)')/examples
4+
5+
# converting FASTA to FASTA just unwraps it
6+
igseq convert $EXAMPLES/inputs/convert/wrapped.fasta unwrapped.fasta
7+
8+
# or, convert to CSV/TSV
9+
igseq convert $EXAMPLES/inputs/convert/wrapped.fasta unwrapped.csv
10+
11+
# or .fastq.gz to .fasta
12+
igseq convert $EXAMPLES/inputs/convert/unwrapped.fastq.gz unwrapped2.fasta
13+
14+
# a - can be used for stdin/stdout, but the format has to be given explicitly:
15+
igseq convert --input-format fa --output-format fa - - < $EXAMPLES/inputs/convert/wrapped.fasta > unwrapped3.fasta
16+
17+
# other table formats can be converted to FASTA or FASTQ if the column names to
18+
# use are specified. the default would find sequence_id and sequence columns
19+
# from AIRR:
20+
igseq convert $EXAMPLES/inputs/convert/airr.tsv from_airr.fasta
21+
# or maybe we want the junctions instead:
22+
igseq convert --col-seq junction $EXAMPLES/inputs/convert/airr.tsv from_airr_junctions.fasta

igseq/data/examples/igblast.sh

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,9 @@
55
# An arbitrary antibody sequence pulled from one of our datasets that looks
66
# complete and in-frame
77
QUERY=$EXAMPLES/inputs/igblast/query.fasta
8+
# A .fastq.gz version, to show off flexibility in query formats
9+
QUERY_FQGZ=$EXAMPLES/inputs/igblast/query.fastq.gz
10+
QUERY_CSV=$EXAMPLES/inputs/igblast/query.csv
811

912
# using the built-in Rhesus germline reference from IMGT and using the default
1013
# text output
@@ -30,3 +33,12 @@ igseq igblast -r rhesus -Q $QUERY -outfmt 19 | cut -f 10,62
3033
# The -num_alignments_V argument clashes with igseq's -n, so we need to use --
3134
# to clarify. igseq will remove the extra - when calling igblastn.
3235
igseq igblast -r rhesus -Q $QUERY -outfmt 7 --num_alignments_V 5
36+
37+
# like the first example, except giving fastq.gz as the query. It'll
38+
# automatically be converted to FASTA while being passed to the igblastn
39+
# command.
40+
igseq igblast -r rhesus/imgt -Q $QUERY_FQGZ
41+
42+
# or using tabular (CSV/TSV) input, and specifying which columns have the IDs
43+
# and sequences
44+
igseq igblast -r rhesus/imgt -Q $QUERY_CSV --col-seq-id SeqID --col-seq Seq -outfmt 19 -out igblast2.tsv

0 commit comments

Comments
 (0)