Skip to content

Commit 7ecc5bb

Browse files
authored
Merge pull request #19 from PopovIILab/dev
Refactor package CLI to native Typer subcommands and update metadata to v1.1.0
2 parents 286e86f + 36adecd commit 7ecc5bb

14 files changed

Lines changed: 1063 additions & 825 deletions

README.md

Lines changed: 177 additions & 172 deletions
Large diffs are not rendered by default.

krakenparser/counts/convert2csv.py

Lines changed: 40 additions & 17 deletions
Original file line number | Diff line number | Diff line change
@@ -1,17 +1,24 @@
11
#!/usr/bin/env python
2-
3-
import argparse
42
import logging
3+
import sys
54
from pathlib import Path
5+
from typing import Optional
66

77
import pandas as pd
8+
import typer
89

910
from krakenparser.utils import ensure_output_dir
1011

1112
_log = logging.getLogger(__name__)
1213

14+
app = typer.Typer(
15+
name="csv",
16+
add_completion=False,
17+
context_settings={"help_option_names": ["-h", "--help"]},
18+
)
19+
1320

14-
def convert_to_csv(input_file, output_file):
21+
def convert_to_csv(input_file: str, output_file: str) -> None:
1522
in_path = Path(input_file)
1623
if not in_path.is_file():
1724
raise FileNotFoundError(f"Input file not found: {in_path}")
@@ -22,26 +29,42 @@ def convert_to_csv(input_file, output_file):
2229
_log.info("Data converted and saved as '%s'.", output_file)
2330

2431

25-
@app.callback(invoke_without_command=True)
def main(
    ctx: typer.Context,
    input_file: Optional[str] = typer.Option(
        None,
        "-i",
        "--input",
        help="Path to the input TXT file. This file should contain sample names in columns and microbial taxa in rows.",
    ),
    output_file: Optional[str] = typer.Option(
        None,
        "-o",
        "--output",
        help="Path to the output CSV file. The script will restructure the data and save it here.",
    ),
) -> None:
    """Reads a TXT file, reorganizes the data, and converts it into a CSV file."""
    # NOTE: the docstring above doubles as the CLI help text, so it is kept
    # to a single line; implementation notes live in comments instead.
    logging.basicConfig(level=logging.INFO, format="%(message)s")

    # Invoked with neither option: show usage and exit successfully.
    if input_file is None and output_file is None:
        print(ctx.get_help())
        raise typer.Exit()

    # Both options are declared optional so the bare invocation above can
    # print help; they are enforced here. Report only what is actually
    # missing instead of always claiming both are absent.
    if not input_file or not output_file:
        missing = [
            flag
            for flag, value in (
                ("'-i / --input'", input_file),
                ("'-o / --output'", output_file),
            )
            if not value
        ]
        noun = "options" if len(missing) > 1 else "option"
        print(
            f"Error: Missing required {noun} {' and '.join(missing)}.",
            file=sys.stderr,
        )
        raise typer.Exit(code=1)

    try:
        convert_to_csv(input_file, output_file)
    except FileNotFoundError as e:
        # Turn the missing-input error into a clean one-line message and a
        # non-zero exit status rather than a traceback.
        print(f"Error: {e}", file=sys.stderr)
        raise typer.Exit(code=1) from e
4467

4568

4669
if __name__ == "__main__":
47-
main()
70+
app()

krakenparser/counts/processing_script.py

Lines changed: 42 additions & 17 deletions
Original file line number | Diff line number | Diff line change
@@ -1,15 +1,24 @@
11
#!/usr/bin/env python
22

3-
import argparse
43
import logging
54
import os
5+
import sys
66
import tempfile
77
from pathlib import Path
8+
from typing import Optional
9+
10+
import typer
811

912
_log = logging.getLogger(__name__)
1013

14+
app = typer.Typer(
15+
name="process",
16+
add_completion=False,
17+
context_settings={"help_option_names": ["-h", "--help"]},
18+
)
19+
1120

12-
def modify_taxa_names(line):
21+
def modify_taxa_names(line: str) -> str:
1322
prefixes = ["s__", "g__", "f__", "o__", "c__", "p__"]
1423
for prefix in prefixes:
1524
if line.startswith(prefix):
@@ -19,7 +28,7 @@ def modify_taxa_names(line):
1928
return line
2029

2130

22-
def process_files(source_file, destination_file):
31+
def process_files(source_file: str, destination_file: str) -> None:
2332
src_path = Path(source_file)
2433
if not src_path.is_file():
2534
raise FileNotFoundError(f"Source file not found: {src_path}")
@@ -53,26 +62,42 @@ def process_files(source_file, destination_file):
5362
_log.info(f"Processed {destination_file} successfully.")
5463

5564

56-
@app.callback(invoke_without_command=True)
def main(
    ctx: typer.Context,
    input_file: Optional[str] = typer.Option(
        None,
        "-i",
        "--input",
        help="Path to the source file. This file's first line will be read and modified.",
    ),
    output_file: Optional[str] = typer.Option(
        None,
        "-o",
        "--output",
        help="Path to the destination file. This file's contents will be updated with cleaned taxa names.",
    ),
) -> None:
    """Reads a source file, processes its first line, modifies taxa names in a destination file, and updates it."""
    # NOTE: the docstring above doubles as the CLI help text, so it is kept
    # to a single line; implementation notes live in comments instead.
    logging.basicConfig(level=logging.INFO, format="%(message)s")

    # Invoked with neither option: show usage and exit successfully.
    if input_file is None and output_file is None:
        print(ctx.get_help())
        raise typer.Exit()

    # Both options are declared optional so the bare invocation above can
    # print help; they are enforced here. Report only what is actually
    # missing instead of always claiming both are absent.
    if not input_file or not output_file:
        missing = [
            flag
            for flag, value in (
                ("'-i / --input'", input_file),
                ("'-o / --output'", output_file),
            )
            if not value
        ]
        noun = "options" if len(missing) > 1 else "option"
        print(
            f"Error: Missing required {noun} {' and '.join(missing)}.",
            file=sys.stderr,
        )
        raise typer.Exit(code=1)

    try:
        process_files(input_file, output_file)
    except FileNotFoundError as e:
        # Turn the missing-source error into a clean one-line message and a
        # non-zero exit status rather than a traceback.
        print(f"Error: {e}", file=sys.stderr)
        raise typer.Exit(code=1) from e
75100

76101

77102
if __name__ == "__main__":
78-
main()
103+
app()

krakenparser/counts/split_mpa.py

Lines changed: 113 additions & 41 deletions
Original file line number | Diff line number | Diff line change
@@ -4,15 +4,23 @@
44
Replaces decombine.sh and decombine_viruses.sh.
55
"""
66

7-
import argparse
87
import logging
98
import re
9+
import sys
1010
from pathlib import Path
11+
from typing import Optional
12+
13+
import typer
1114

1215
from krakenparser.utils import ensure_output_dir
1316

1417
_log = logging.getLogger(__name__)
1518

19+
app = typer.Typer(
20+
name="split",
21+
add_completion=False,
22+
context_settings={"help_option_names": ["-h", "--help"]},
23+
)
1624

1725
_RANKS = [
1826
("species", "s__", []),
@@ -23,20 +31,21 @@
2331
("phylum", "p__", ["s__", "g__", "f__", "o__", "c__"]),
2432
]
2533

26-
_HUMAN_TAXA = {
27-
"species": "s__Homo_sapiens",
28-
"genus": "g__Homo",
29-
"family": "f__Hominidae",
30-
"order": "o__Primates",
31-
"class": "c__Mammalia",
32-
"phylum": "p__Chordata",
33-
}
34+
_HUMAN_MARKERS = frozenset(
35+
[
36+
"s__Homo_sapiens",
37+
"g__Homo",
38+
"f__Hominidae",
39+
"o__Primates",
40+
"c__Mammalia",
41+
"p__Chordata",
42+
]
43+
)
3444

3545
_ACCESSION_RE = re.compile(r"(SRS|SRR|SRX|ERS|ERR|ERX|DRS|DRR|DRX)\d*-")
3646

3747

3848
def _strip_path_prefix(line: str) -> str:
39-
"""'d__X|p__Y|s__Z\t10\t20' → 's__Z\t10\t20'"""
4049
tab = line.find("\t")
4150
if tab == -1:
4251
return line
@@ -46,10 +55,20 @@ def _strip_path_prefix(line: str) -> str:
4655
return _ACCESSION_RE.sub("", segment + rest)
4756

4857

58+
def _human_in_line(line: str) -> bool:
    """Return True if any segment of the line's lineage path is a human marker.

    The lineage path is the text before the first tab (the whole line when
    it contains no tab). It is split on ``|`` and each segment is compared
    for exact equality against ``_HUMAN_MARKERS``.
    """
    # NOTE(review): _HUMAN_MARKERS also lists higher ranks (e.g. p__Chordata,
    # c__Mammalia), so any lineage passing through those ranks matches, not
    # only Homo sapiens itself. Confirm that breadth is intended.
    lineage, _, _ = line.partition("\t")
    return any(taxon in _HUMAN_MARKERS for taxon in lineage.split("|"))
63+
64+
4965
def split_mpa(
5066
input_file: str,
5167
output_dir: str,
5268
viruses_only: bool = False,
69+
bacteria_only: bool = False,
70+
fungi_only: bool = False,
71+
archaea_only: bool = False,
5372
keep_human: bool = False,
5473
) -> None:
5574
in_path = Path(input_file)
@@ -58,17 +77,28 @@ def split_mpa(
5877
out_path = ensure_output_dir(output_dir, is_file=False)
5978
(out_path / "txt").mkdir(exist_ok=True)
6079

61-
lines = in_path.read_text().splitlines()
62-
data_lines = [ln for ln in lines if not ln.startswith("#") and ln.strip()]
80+
all_lines = [
81+
ln
82+
for ln in in_path.read_text().splitlines()
83+
if not ln.startswith("#") and ln.strip()
84+
]
6385

86+
data_lines = all_lines.copy()
6487
if viruses_only:
6588
data_lines = [ln for ln in data_lines if "d__Viruses" in ln]
89+
if bacteria_only:
90+
data_lines = [ln for ln in data_lines if "d__Bacteria" in ln]
91+
if fungi_only:
92+
data_lines = [ln for ln in data_lines if "k__Fungi" in ln]
93+
if archaea_only:
94+
data_lines = [ln for ln in data_lines if "d__Archaea" in ln]
6695

67-
filter_human = not keep_human and not viruses_only
96+
if keep_human:
97+
human_lines = [ln for ln in all_lines if _human_in_line(ln)]
98+
data_lines = list(dict.fromkeys(data_lines + human_lines))
6899

69100
for rank_name, rank_prefix, exclude_prefixes in _RANKS:
70101
result = []
71-
human_pattern = _HUMAN_TAXA[rank_name]
72102

73103
for line in data_lines:
74104
if rank_prefix not in line:
@@ -77,7 +107,7 @@ def split_mpa(
77107
continue
78108
if any(ep in line for ep in exclude_prefixes):
79109
continue
80-
if filter_human and human_pattern in line:
110+
if not keep_human and _human_in_line(line):
81111
continue
82112
result.append(_strip_path_prefix(line))
83113

@@ -87,33 +117,75 @@ def split_mpa(
87117
_log.info("MPA file split successfully. Output stored in %s", output_dir)
88118

89119

90-
@app.callback(invoke_without_command=True)
def main(
    ctx: typer.Context,
    input_file: Optional[str] = typer.Option(
        None,
        "-i",
        "--input",
        help="Input combined MPA file.",
    ),
    output_dir: Optional[str] = typer.Option(
        None,
        "-o",
        "--output",
        help="Output directory.",
    ),
    viruses_only: bool = typer.Option(
        False,
        "--viruses",
        help="Extract only VIRUSES domain taxa.",
    ),
    bacteria_only: bool = typer.Option(
        False,
        "--bacteria",
        help="Extract only BACTERIA domain taxa.",
    ),
    fungi_only: bool = typer.Option(
        False,
        "--fungi",
        help="Extract only FUNGI kingdom taxa.",
    ),
    archaea_only: bool = typer.Option(
        False,
        "--archaea",
        help="Extract only ARCHAEA domain taxa.",
    ),
    keep_human: bool = typer.Option(
        False,
        "--keep-human",
        help="Retain human-related taxa (default: filtered out).",
    ),
) -> None:
    """Split a combined MPA table into per-rank TXT files."""
    # NOTE: the docstring above doubles as the CLI help text, so it is kept
    # to a single line; implementation notes live in comments instead.
    logging.basicConfig(level=logging.INFO, format="%(message)s")

    # Invoked without input and output: show usage and exit successfully.
    if input_file is None and output_dir is None:
        print(ctx.get_help())
        raise typer.Exit()

    # Both options are declared optional so the bare invocation above can
    # print help; they are enforced here. Report only what is actually
    # missing instead of always claiming both are absent.
    if not input_file or not output_dir:
        missing = [
            flag
            for flag, value in (
                ("'-i / --input'", input_file),
                ("'-o / --output'", output_dir),
            )
            if not value
        ]
        noun = "options" if len(missing) > 1 else "option"
        print(
            f"Error: Missing required {noun} {' and '.join(missing)}.",
            file=sys.stderr,
        )
        raise typer.Exit(code=1)

    # split_mpa applies the taxon filters one after another, so a line must
    # satisfy every selected filter at once. Combining them would silently
    # yield empty per-rank files, so reject the combination up front.
    selected = [
        flag
        for flag, enabled in (
            ("'--viruses'", viruses_only),
            ("'--bacteria'", bacteria_only),
            ("'--fungi'", fungi_only),
            ("'--archaea'", archaea_only),
        )
        if enabled
    ]
    if len(selected) > 1:
        print(
            f"Error: Options {', '.join(selected)} cannot be combined; choose one.",
            file=sys.stderr,
        )
        raise typer.Exit(code=1)

    try:
        split_mpa(
            input_file,
            output_dir,
            viruses_only=viruses_only,
            bacteria_only=bacteria_only,
            fungi_only=fungi_only,
            archaea_only=archaea_only,
            keep_human=keep_human,
        )
    except FileNotFoundError as e:
        # Turn the missing-input error into a clean one-line message and a
        # non-zero exit status rather than a traceback.
        print(f"Error: {e}", file=sys.stderr)
        raise typer.Exit(code=1) from e
116188

117189

118190
if __name__ == "__main__":
119-
main()
191+
app()

0 commit comments

Comments
 (0)