Skip to content

Commit 6b479be

Browse files
authored
Merge pull request #23 from PopovIILab/dev
refactor(tests,docs): polish docstrings, type annotations, and linting rules
2 parents a1ee760 + bcbe409 commit 6b479be

24 files changed

Lines changed: 1750 additions & 941 deletions

krakenparser/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22
from .kpplot.stackedbar import stacked_barplot
33
from .kpplot.streamgraph import streamgraph
44

5-
__all__ = [
5+
__all__: list[str] = [
66
"stacked_barplot",
77
"streamgraph",
88
"clustermap",

krakenparser/counts/convert2csv.py

Lines changed: 42 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,12 @@
1-
#!/usr/bin/env python
1+
#!/usr/bin/env python3
2+
"""Matrix manipulation utility for restructuring metagenomic abundance tables.
3+
4+
This module converts tab-delimited abundance tables (traditionally structured
5+
with features/taxa as rows and samples as columns) into standardized,
6+
transposed CSV sheets conforming to the tidy data format (samples as rows).
7+
"""
8+
29
import logging
3-
import sys
410
from pathlib import Path
511
from typing import Optional
612

@@ -9,40 +15,59 @@
915

1016
from krakenparser.utils import ensure_output_dir
1117

12-
_log = logging.getLogger(__name__)
18+
# Initialize module-level isolated logger
19+
_log: logging.Logger = logging.getLogger(__name__)
1320

14-
app = typer.Typer(
21+
# Dedicated Typer routing application instantiation
22+
app: typer.Typer = typer.Typer(
1523
name="csv",
1624
add_completion=False,
1725
context_settings={"help_option_names": ["-h", "--help"]},
1826
)
1927

2028

21-
def convert_to_csv(input_file: str, output_file: str) -> None:
22-
in_path = Path(input_file)
23-
if not in_path.is_file():
24-
raise FileNotFoundError(f"Input file not found: {in_path}")
25-
out_path = ensure_output_dir(output_file, is_file=True)
29+
def convert_to_csv(input_file: Path, output_file: Path) -> None:
30+
"""Transpose a tab-separated matrix and export it as a sample-centric CSV.
31+
32+
Reads a matrix where columns represent samples and rows represent taxa,
33+
performs an algebraic transposition operation (.T), and locks the new row
34+
index under the canonical 'Sample_id' header label.
35+
36+
Args:
37+
input_file: Path to the validated incoming tab-separated matrix file.
38+
output_file: Target path where the restructured CSV matrix will be dumped.
2639
27-
data = pd.read_csv(in_path, sep="\t", index_col=0)
40+
Raises:
41+
FileNotFoundError: Triggered if the specified input text resource is missing.
42+
"""
43+
if not input_file.is_file():
44+
raise FileNotFoundError(f"Input file not found: {input_file}")
45+
46+
out_path: Path = ensure_output_dir(output_file, is_file=True)
47+
48+
# Load high-dimensional matrix (Rows: Taxa, Columns: Samples)
49+
data: pd.DataFrame = pd.read_csv(input_file, sep="\t", index_col=0)
50+
51+
# Execute matrix transposition to shift samples to rows (Tidy Data layout)
2852
data.T.to_csv(out_path, index_label="Sample_id")
29-
_log.info("Data converted and saved as '%s'.", output_file)
53+
54+
_log.info("Data successfully transposed and saved to '%s'.", output_file)
3055

3156

3257
@app.callback(invoke_without_command=True)
3358
def main(
3459
ctx: typer.Context,
35-
input_file: Optional[str] = typer.Option(
60+
input_file: Optional[Path] = typer.Option(
3661
None,
3762
"-i",
3863
"--input",
39-
help="Path to the input TXT file. This file should contain sample names in columns and microbial taxa in rows.",
64+
help="Path to the input tab-delimited TXT file (samples in columns, taxa in rows).",
4065
),
41-
output_file: Optional[str] = typer.Option(
66+
output_file: Optional[Path] = typer.Option(
4267
None,
4368
"-o",
4469
"--output",
45-
help="Path to the output CSV file. The script will restructure the data and save it here.",
70+
help="Path to the output transposed CSV file.",
4671
),
4772
) -> None:
4873
"""Reads a TXT file, reorganizes the data, and converts it into a CSV file."""
@@ -53,16 +78,13 @@ def main(
5378
raise typer.Exit()
5479

5580
if not input_file or not output_file:
56-
print(
57-
"Error: Missing required options '-i / --input' and '-o / --output'.",
58-
file=sys.stderr,
59-
)
81+
print("Error: Missing required options '-i / --input' and '-o / --output'.")
6082
raise typer.Exit(code=1)
6183

6284
try:
6385
convert_to_csv(input_file, output_file)
6486
except FileNotFoundError as e:
65-
print(f"Error: {e}", file=sys.stderr)
87+
print(f"Error: {e}")
6688
raise typer.Exit(code=1)
6789

6890

krakenparser/counts/processing_script.py

Lines changed: 72 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,11 @@
1-
#!/usr/bin/env python
1+
#!/usr/bin/env python3
2+
"""Post-processing matrix utility for metadata refinement and taxonomic sanitization.
3+
4+
This module cleans upstream pipeline artifacts by removing technical file extensions
5+
from sample headers and restoring canonical spaces to underscore-separated taxonomic
6+
nomenclature strings (e.g., converting 's__Escherichia_coli' to 'Escherichia coli').
7+
File mutations are executed via atomic filesystem transactions.
8+
"""
29

310
import logging
411
import os
@@ -9,73 +16,107 @@
916

1017
import typer
1118

12-
_log = logging.getLogger(__name__)
19+
# Initialize module-level isolated logger
20+
_log: logging.Logger = logging.getLogger(__name__)
1321

14-
app = typer.Typer(
22+
# Dedicated Typer routing application instantiation
23+
app: typer.Typer = typer.Typer(
1524
name="process",
1625
add_completion=False,
1726
context_settings={"help_option_names": ["-h", "--help"]},
1827
)
1928

2029

2130
def modify_taxa_names(line: str) -> str:
22-
prefixes = ["s__", "g__", "f__", "o__", "c__", "p__"]
31+
"""Sanitize taxonomic names by replacing internal underscores with spaces.
32+
33+
Scans the line for standard taxonomic rank prefixes (s__, g__, etc.). If found,
34+
the primary taxon descriptor string is decoupled, sanitized of internal
35+
technical underscores, and reconstructed while preserving the trailing tab-separated fields.
36+
37+
Args:
38+
line: A raw text row from the matrix containing taxonomic descriptors.
39+
40+
Returns:
41+
str: The structurally preserved string with restored space characters.
42+
"""
43+
prefixes: list[str] = ["s__", "g__", "f__", "o__", "c__", "p__"]
2344
for prefix in prefixes:
2445
if line.startswith(prefix):
25-
parts = line[len(prefix) :].split("\t")
46+
# Strip the rank prefix, then split the row into its tab-separated fields
47+
parts: list[str] = line.removeprefix(prefix).split("\t")
2648
parts[0] = parts[0].replace("_", " ")
2749
return "\t".join(parts)
2850
return line
2951

3052

31-
def process_files(source_file: str, destination_file: str) -> None:
32-
src_path = Path(source_file)
33-
if not src_path.is_file():
34-
raise FileNotFoundError(f"Source file not found: {src_path}")
35-
dest_path = Path(destination_file)
36-
if not dest_path.is_file():
37-
raise FileNotFoundError(f"Destination file not found: {dest_path}")
53+
def process_files(source_file: Path, destination_file: Path) -> None:
54+
"""Synchronize matrix headers and sanitize taxonomic profiles atomically.
55+
56+
Extracts clean cohort descriptors from the header of a source tracker,
57+
applies string cleaning to a targeted taxonomy mapping spreadsheet,
58+
and updates the destination file utilizing atomic replacement blocks.
3859
39-
# Read the first line from the source file and modify it
40-
with open(src_path, "r") as file:
41-
first_line_source = file.readline()
42-
modified_first_line = "\t".join(
60+
Args:
61+
source_file: Validated Path to the template matrix containing pristine headers.
62+
destination_file: Target Path to the file undergoing line-by-line taxonomy cleaning.
63+
64+
Raises:
65+
FileNotFoundError: Triggered if either the source or destination targets are absent.
66+
"""
67+
if not source_file.is_file():
68+
raise FileNotFoundError(f"Source file not found: {source_file}")
69+
if not destination_file.is_file():
70+
raise FileNotFoundError(f"Destination file not found: {destination_file}")
71+
72+
# Step 1: Read and truncate raw pipeline suffixes from sample headers
73+
with open(source_file, "r", encoding="utf-8") as file:
74+
first_line_source: str = file.readline()
75+
76+
modified_first_line: str = "\t".join(
4377
word.split(".")[0] for word in first_line_source.split()
4478
)
4579

46-
# Read all content from the destination file and modify taxa names
47-
with open(dest_path, "r") as file:
48-
lines = file.readlines()
49-
modified_lines = [modify_taxa_names(line.strip()) for line in lines]
80+
# Step 2: Read every destination line into memory and clean the taxa name on each
81+
with open(destination_file, "r", encoding="utf-8") as file:
82+
lines: list[str] = file.readlines()
83+
84+
modified_lines: list[str] = [modify_taxa_names(line.strip()) for line in lines]
5085

51-
# Combine the modified first line with the modified content of the destination file
52-
updated_content = modified_first_line + "\n" + "\n".join(modified_lines)
86+
# Step 3: Integrate matrices and commit layout modifications to disk
87+
joined_lines: str = "\n".join(modified_lines)
88+
updated_content: str = f"{modified_first_line}\n{joined_lines}"
5389

54-
# Write atomically: write to a temp file in the same directory, then replace
90+
# Write atomically: stage the content in a temp file in the destination's directory, then replace
5591
with tempfile.NamedTemporaryFile(
56-
mode="w", dir=dest_path.parent, delete=False, suffix=".tmp"
92+
mode="w",
93+
dir=destination_file.parent,
94+
delete=False,
95+
suffix=".tmp",
96+
encoding="utf-8",
5797
) as tmp:
5898
tmp.write(updated_content)
59-
tmp_path = tmp.name
60-
os.replace(tmp_path, dest_path)
99+
tmp_path: str = tmp.name
61100

62-
_log.info(f"Processed {destination_file} successfully.")
101+
# Atomically replace the destination with the fully written temp file
102+
os.replace(tmp_path, destination_file)
103+
_log.info("Processed '%s' successfully.", destination_file)
63104

64105

65106
@app.callback(invoke_without_command=True)
66107
def main(
67108
ctx: typer.Context,
68-
input_file: Optional[str] = typer.Option(
109+
input_file: Optional[Path] = typer.Option(
69110
None,
70111
"-i",
71112
"--input",
72-
help="Path to the source file. This file's first line will be read and modified.",
113+
help="Path to the source file (used to extract and truncate header labels).",
73114
),
74-
output_file: Optional[str] = typer.Option(
115+
output_file: Optional[Path] = typer.Option(
75116
None,
76117
"-o",
77118
"--output",
78-
help="Path to the destination file. This file's contents will be updated with cleaned taxa names.",
119+
help="Path to the destination matrix undergoing taxonomic name sanitation.",
79120
),
80121
) -> None:
81122
"""Reads a source file, processes its first line, modifies taxa names in a destination file, and updates it."""

0 commit comments

Comments
 (0)