Skip to content

Commit 9a32249

Browse files
committed
simplify file handling
1 parent 8dc72a0 commit 9a32249

File tree

2 files changed

+35
-110
lines changed

2 files changed

+35
-110
lines changed

src/gwascatalog/sumstatapp/cli/__main__.py

Lines changed: 31 additions & 90 deletions
Original file line number | Diff line number | Diff line change
@@ -4,64 +4,44 @@
44
55
gwascatalog beyondsnp validate INPUT [INPUT ...] --type {CNV,GENE} [OPTIONS]
66
7-
Accepts individual files, directories (all regular files inside are
8-
processed), or quoted glob patterns. Run with ``--help`` for the full
9-
list of options.
7+
Accepts a list of files. Run with ``--help`` for the full list of options.
108
"""
119

1210
from __future__ import annotations
1311

1412
import argparse
1513
import logging
1614
import sys
17-
from concurrent.futures import ProcessPoolExecutor, as_completed
1815
from pathlib import Path
1916
from typing import TYPE_CHECKING
2017

18+
from gwascatalog.sumstatlib import SumstatConfig
19+
2120
from gwascatalog.sumstatapp.cli._validate import FileResult, validate_file
2221

2322
if TYPE_CHECKING:
2423
from collections.abc import Sequence
2524

2625
logger = logging.getLogger(__name__)
27-
26+
logging.basicConfig(
27+
level=logging.INFO,
28+
format="%(asctime)s [%(levelname)s] [%(module)s] %(message)s",
29+
datefmt="%Y-%m-%d %H:%M:%S",
30+
)
2831

2932
# ── Input resolution ──────────────────────────────────────────────
3033

3134

3235
def _resolve_inputs(raw: list[str]) -> list[Path]:
33-
"""Turn user-supplied paths, directories, and glob patterns into files.
34-
35-
Resolution order for each *raw* entry:
36-
37-
1. If it names an existing **file**, use it directly.
38-
2. If it names an existing **directory**, take every regular
39-
(non-hidden) file inside it.
40-
3. Otherwise, treat it as a **glob pattern** (supports ``**``).
41-
"""
36+
"""Turn user-supplied paths into files."""
4237
resolved: list[Path] = []
4338

4439
for entry in raw:
4540
p = Path(entry)
46-
4741
if p.is_file():
4842
resolved.append(p.resolve())
49-
elif p.is_dir():
50-
resolved.extend(
51-
sorted(
52-
f.resolve()
53-
for f in p.iterdir()
54-
if f.is_file() and not f.name.startswith(".")
55-
)
56-
)
5743
else:
58-
matches = sorted(m.resolve() for m in Path().glob(entry))
59-
files = [m for m in matches if m.is_file()]
60-
if not files:
61-
logger.warning(
62-
f"{entry} did not match any files",
63-
)
64-
resolved.extend(files)
44+
logger.warning(f"Skipping {entry}: not a file")
6545

6646
# Deduplicate while preserving order
6747
seen: set[Path] = set()
@@ -82,7 +62,7 @@ def _add_validate_args(parser: argparse.ArgumentParser) -> None:
8262
"inputs",
8363
nargs="+",
8464
metavar="INPUT",
85-
help="Files, directories, or glob patterns to validate",
65+
help="Files to validate",
8666
)
8767
parser.add_argument(
8868
"--type",
@@ -117,18 +97,6 @@ def _add_validate_args(parser: argparse.ArgumentParser) -> None:
11797
default=Path("validated"),
11898
help="Output directory for results (default: ./validated/)",
11999
)
120-
parser.add_argument(
121-
"-w",
122-
"--workers",
123-
type=int,
124-
default=1,
125-
metavar="N",
126-
help=(
127-
"Number of parallel worker processes. "
128-
"Use 1 (default) for sequential execution, "
129-
"which is easier to debug."
130-
),
131-
)
132100
parser.set_defaults(func=_run_validate)
133101

134102

@@ -155,7 +123,7 @@ def _build_parser() -> argparse.ArgumentParser:
155123
"Validate GWAS summary statistics files for submission to the GWAS Catalog."
156124
),
157125
epilog=(
158-
"Example: gwascatalog beyondsnp validate data/*.tsv "
126+
"Example: gwascatalog beyondsnp validate data/file.tsv "
159127
"--type GENE --assembly GRCh38"
160128
),
161129
)
@@ -217,58 +185,31 @@ def _run_validate(args: argparse.Namespace, parser: argparse.ArgumentParser) ->
217185
output_dir: Path = args.output_dir.resolve()
218186
output_dir.mkdir(parents=True, exist_ok=True)
219187

220-
workers: int = max(1, args.workers)
221-
222-
print(f"Validating {len(files)} file(s) with {workers} worker(s)")
188+
print(f"Validating {len(files)} file(s)")
223189
print(f"Output: {output_dir}\n")
224190

225-
common_kwargs: dict[str, object] = {
226-
"output_dir": str(output_dir),
227-
"variation_type": args.variation_type,
228-
"assembly": args.assembly,
229-
"primary_effect_size": args.primary_effect_size,
230-
"allow_zero_pvalues": args.allow_zero_pvalues,
231-
}
191+
config = SumstatConfig(
192+
allow_zero_p_values=args.allow_zero_pvalues,
193+
primary_effect_size=args.primary_effect_size,
194+
assembly=args.assembly,
195+
)
232196

233197
# ── Dispatch ──────────────────────────────────────────────
234198
results: list[FileResult] = []
235199

236-
if workers == 1:
237-
# Sequential — simple and easy to debug
238-
for f in files:
239-
print(f" {f.name} ...", end=" ", flush=True)
240-
result = validate_file(input_path=str(f), **common_kwargs)
241-
passed = not result.fatal_error and result.error_count == 0
242-
print("PASS" if passed else "FAIL")
243-
results.append(result)
244-
else:
245-
with ProcessPoolExecutor(max_workers=workers) as pool:
246-
future_to_path = {
247-
pool.submit(validate_file, input_path=str(f), **common_kwargs): f
248-
for f in files
249-
}
250-
for future in as_completed(future_to_path):
251-
f = future_to_path[future]
252-
try:
253-
result = future.result()
254-
except Exception as exc: # noqa: BLE001
255-
logger.exception("Unexpected error for %s", f.name)
256-
result = FileResult(
257-
input_path=f,
258-
output_path=None,
259-
error_path=None,
260-
rows_processed=0,
261-
valid_count=0,
262-
error_count=0,
263-
elapsed_seconds=0.0,
264-
md5_checksum=None,
265-
fatal_error=str(exc),
266-
)
267-
passed = not result.fatal_error and result.error_count == 0
268-
print(f" [{'PASS' if passed else 'FAIL'}] {f.name}")
269-
results.append(result)
270-
271-
# ── Summary ───────────────────────────────────────────────
200+
# Sequential — simple and easy to debug
201+
for f in files:
202+
print(f" {f.name} ...", end=" ", flush=True)
203+
result = validate_file(
204+
input_path=str(f),
205+
config=config,
206+
output_dir=output_dir,
207+
variation_type=args.variation_type,
208+
)
209+
passed = not result.fatal_error and result.error_count == 0
210+
print("PASS" if passed else "FAIL")
211+
results.append(result)
212+
272213
_print_summary(results)
273214

274215
has_failures = any(r.error_count > 0 or r.fatal_error is not None for r in results)

src/gwascatalog/sumstatapp/cli/_validate.py

Lines changed: 4 additions & 20 deletions
Original file line number | Diff line number | Diff line change
@@ -1,8 +1,4 @@
1-
"""Worker function for validating a single summary statistics file.
2-
3-
Kept in a standalone module so it is importable (and therefore picklable)
4-
by :class:`~concurrent.futures.ProcessPoolExecutor`.
5-
"""
1+
"""Worker function for validating a single summary statistics file."""
62

73
from __future__ import annotations
84

@@ -12,12 +8,11 @@
128
import time
139
from dataclasses import dataclass
1410
from pathlib import Path
15-
from typing import TYPE_CHECKING, Literal
11+
from typing import TYPE_CHECKING
1612

1713
from gwascatalog.sumstatlib import (
1814
CNVSumstatModel,
1915
GeneSumstatModel,
20-
GenomeAssembly,
2116
SumstatConfig,
2217
SumstatTable,
2318
)
@@ -84,18 +79,12 @@ def _compute_md5(path: Path) -> str:
8479

8580
def validate_file(
8681
input_path: str,
87-
output_dir: str,
82+
output_dir: str | Path,
8883
variation_type: str,
89-
assembly: str | None,
90-
primary_effect_size: Literal["beta", "odds_ratio", "hazard_ratio", "z_score"]
91-
| None,
92-
allow_zero_pvalues: bool,
84+
config: SumstatConfig,
9385
) -> FileResult:
9486
"""Validate a single summary statistics file and write results.
9587
96-
All arguments use primitive types so the function can be dispatched to a
97-
:class:`~concurrent.futures.ProcessPoolExecutor` without pickling issues.
98-
9988
Returns:
10089
A :class:`FileResult` summarising the outcome.
10190
"""
@@ -113,11 +102,6 @@ def validate_file(
113102

114103
try:
115104
model = _get_model(variation_type)
116-
config = SumstatConfig(
117-
allow_zero_p_values=allow_zero_pvalues,
118-
assembly=GenomeAssembly(assembly) if assembly else None,
119-
primary_effect_size=primary_effect_size,
120-
)
121105

122106
table = SumstatTable(data_model=model, input_path=inp, config=config)
123107

0 commit comments

Comments
 (0)