|
1 | | -#!/usr/bin/env python |
| 1 | +#!/usr/bin/env python3 |
| 2 | +"""Post-processing matrix utility for metadata refinement and taxonomic sanitization. |
| 3 | +
|
| 4 | +This module cleans upstream pipeline artifacts by removing technical file extensions |
| 5 | +from sample headers and restoring canonical spaces to underscore-separated taxonomic |
| 6 | +nomenclature strings (e.g., converting 's__Escherichia_coli' to 'Escherichia coli'). |
| 7 | +The destination file is rewritten by writing a temporary file and atomically replacing the original. |
| 8 | +""" |
2 | 9 |
|
3 | 10 | import logging |
4 | 11 | import os |
|
9 | 16 |
|
10 | 17 | import typer |
11 | 18 |
|
12 | | -_log = logging.getLogger(__name__) |
| 19 | +# Module-level logger |
| 20 | +_log: logging.Logger = logging.getLogger(__name__) |
13 | 21 |
|
14 | | -app = typer.Typer( |
| 22 | +# Typer application for the "process" command |
| 23 | +app: typer.Typer = typer.Typer( |
15 | 24 | name="process", |
16 | 25 | add_completion=False, |
17 | 26 | context_settings={"help_option_names": ["-h", "--help"]}, |
18 | 27 | ) |
19 | 28 |
|
20 | 29 |
|
21 | 30 | def modify_taxa_names(line: str) -> str: |
22 | | - prefixes = ["s__", "g__", "f__", "o__", "c__", "p__"] |
| 31 | + """Sanitize taxonomic names by replacing internal underscores with spaces. |
| 32 | +
|
| 33 | + Checks whether the line starts with a taxonomic rank prefix (s__, g__, f__, o__, |
| 34 | + c__, p__). If so, the prefix is removed and underscores in the first tab-separated |
| 35 | + field are replaced with spaces; the remaining tab-separated fields are kept as-is. |
| 36 | +
|
| 37 | + Args: |
| 38 | + line: A raw text row from the matrix containing taxonomic descriptors. |
| 39 | +
|
| 40 | + Returns: |
| 41 | + str: The cleaned line, or the original line unchanged if no prefix matches. |
| 42 | + """ |
| 43 | + prefixes: list[str] = ["s__", "g__", "f__", "o__", "c__", "p__"] |
23 | 44 | for prefix in prefixes: |
24 | 45 | if line.startswith(prefix): |
25 | | - parts = line[len(prefix) :].split("\t") |
| 46 | + # Strip the rank prefix, then split the remainder into tab-separated fields |
| 47 | + parts: list[str] = line.removeprefix(prefix).split("\t") |
26 | 48 | parts[0] = parts[0].replace("_", " ") |
27 | 49 | return "\t".join(parts) |
28 | 50 | return line |
29 | 51 |
|
30 | 52 |
|
31 | | -def process_files(source_file: str, destination_file: str) -> None: |
32 | | - src_path = Path(source_file) |
33 | | - if not src_path.is_file(): |
34 | | - raise FileNotFoundError(f"Source file not found: {src_path}") |
35 | | - dest_path = Path(destination_file) |
36 | | - if not dest_path.is_file(): |
37 | | - raise FileNotFoundError(f"Destination file not found: {dest_path}") |
| 53 | +def process_files(source_file: Path, destination_file: Path) -> None: |
| 54 | + """Synchronize matrix headers and sanitize taxonomic profiles atomically. |
| 55 | +
|
| 56 | + Reads the first line of the source file and truncates each whitespace-separated |
| 57 | + token at its first '.', cleans the taxa name on every line of the destination file, |
| 58 | + and rewrites the destination file with the cleaned header line prepended. |
38 | 59 |
|
39 | | - # Read the first line from the source file and modify it |
40 | | - with open(src_path, "r") as file: |
41 | | - first_line_source = file.readline() |
42 | | - modified_first_line = "\t".join( |
| 60 | + Args: |
| 61 | + source_file: Path to the file whose first line supplies the header. |
| 62 | + destination_file: Path to the file whose taxa names are cleaned; it is replaced with the updated content. |
| 63 | +
|
| 64 | + Raises: |
| 65 | + FileNotFoundError: Triggered if either the source or destination targets are absent. |
| 66 | + """ |
| 67 | + if not source_file.is_file(): |
| 68 | + raise FileNotFoundError(f"Source file not found: {source_file}") |
| 69 | + if not destination_file.is_file(): |
| 70 | + raise FileNotFoundError(f"Destination file not found: {destination_file}") |
| 71 | + |
| 72 | + # Step 1: Read and truncate raw pipeline suffixes from sample headers |
| 73 | + with open(source_file, "r", encoding="utf-8") as file: |
| 74 | + first_line_source: str = file.readline() |
| 75 | + |
| 76 | + modified_first_line: str = "\t".join( |
43 | 77 | word.split(".")[0] for word in first_line_source.split() |
44 | 78 | ) |
45 | 79 |
|
46 | | - # Read all content from the destination file and modify taxa names |
47 | | - with open(dest_path, "r") as file: |
48 | | - lines = file.readlines() |
49 | | - modified_lines = [modify_taxa_names(line.strip()) for line in lines] |
| 80 | + # Step 2: Read all destination lines into memory and clean the taxa name on each |
| 81 | + with open(destination_file, "r", encoding="utf-8") as file: |
| 82 | + lines: list[str] = file.readlines() |
| 83 | + |
| 84 | + modified_lines: list[str] = [modify_taxa_names(line.strip()) for line in lines] |
50 | 85 |
|
51 | | - # Combine the modified first line with the modified content of the destination file |
52 | | - updated_content = modified_first_line + "\n" + "\n".join(modified_lines) |
| 86 | + # Step 3: Prepend the cleaned header line to the cleaned destination lines |
| 87 | + joined_lines: str = "\n".join(modified_lines) |
| 88 | + updated_content: str = f"{modified_first_line}\n{joined_lines}" |
53 | 89 |
|
54 | | - # Write atomically: write to a temp file in the same directory, then replace |
| 90 | + # Write atomically: write to a temp file in the destination's directory, then replace |
55 | 91 | with tempfile.NamedTemporaryFile( |
56 | | - mode="w", dir=dest_path.parent, delete=False, suffix=".tmp" |
| 92 | + mode="w", |
| 93 | + dir=destination_file.parent, |
| 94 | + delete=False, |
| 95 | + suffix=".tmp", |
| 96 | + encoding="utf-8", |
57 | 97 | ) as tmp: |
58 | 98 | tmp.write(updated_content) |
59 | | - tmp_path = tmp.name |
60 | | - os.replace(tmp_path, dest_path) |
| 99 | + tmp_path: str = tmp.name |
61 | 100 |
|
62 | | - _log.info(f"Processed {destination_file} successfully.") |
| 101 | + # Replace the destination file with the fully written temp file |
| 102 | + os.replace(tmp_path, destination_file) |
| 103 | + _log.info("Processed '%s' successfully.", destination_file) |
63 | 104 |
|
64 | 105 |
|
65 | 106 | @app.callback(invoke_without_command=True) |
66 | 107 | def main( |
67 | 108 | ctx: typer.Context, |
68 | | - input_file: Optional[str] = typer.Option( |
| 109 | + input_file: Optional[Path] = typer.Option( |
69 | 110 | None, |
70 | 111 | "-i", |
71 | 112 | "--input", |
72 | | - help="Path to the source file. This file's first line will be read and modified.", |
| 113 | + help="Path to the source file (used to extract and truncate header labels).", |
73 | 114 | ), |
74 | | - output_file: Optional[str] = typer.Option( |
| 115 | + output_file: Optional[Path] = typer.Option( |
75 | 116 | None, |
76 | 117 | "-o", |
77 | 118 | "--output", |
78 | | - help="Path to the destination file. This file's contents will be updated with cleaned taxa names.", |
| 119 | + help="Path to the destination matrix undergoing taxonomic name sanitation.", |
79 | 120 | ), |
80 | 121 | ) -> None: |
81 | 122 | """Reads a source file, processes its first line, modifies taxa names in a destination file, and updates it.""" |
|
0 commit comments