44
55 gwascatalog beyondsnp validate INPUT [INPUT ...] --type {CNV,GENE} [OPTIONS]
66
7- Accepts individual files, directories (all regular files inside are
8- processed), or quoted glob patterns. Run with ``--help`` for the full
9- list of options.
7+ Accepts a list of files. Run with ``--help`` for the full list of options.
108"""
119
1210from __future__ import annotations
1311
1412import argparse
1513import logging
1614import sys
17- from concurrent .futures import ProcessPoolExecutor , as_completed
1815from pathlib import Path
1916from typing import TYPE_CHECKING
2017
18+ from gwascatalog .sumstatlib import SumstatConfig
19+
2120from gwascatalog .sumstatapp .cli ._validate import FileResult , validate_file
2221
2322if TYPE_CHECKING :
2423 from collections .abc import Sequence
2524
2625logger = logging .getLogger (__name__ )
27-
26+ logging .basicConfig (
27+ level = logging .INFO ,
28+ format = "%(asctime)s [%(levelname)s] [%(module)s] %(message)s" ,
29+ datefmt = "%Y-%m-%d %H:%M:%S" ,
30+ )
2831
2932# ── Input resolution ──────────────────────────────────────────────
3033
3134
3235def _resolve_inputs (raw : list [str ]) -> list [Path ]:
33- """Turn user-supplied paths, directories, and glob patterns into files.
34-
35- Resolution order for each *raw* entry:
36-
37- 1. If it names an existing **file**, use it directly.
38- 2. If it names an existing **directory**, take every regular
39- (non-hidden) file inside it.
40- 3. Otherwise, treat it as a **glob pattern** (supports ``**``).
41- """
36+ """Turn user-supplied paths into files."""
4237 resolved : list [Path ] = []
4338
4439 for entry in raw :
4540 p = Path (entry )
46-
4741 if p .is_file ():
4842 resolved .append (p .resolve ())
49- elif p .is_dir ():
50- resolved .extend (
51- sorted (
52- f .resolve ()
53- for f in p .iterdir ()
54- if f .is_file () and not f .name .startswith ("." )
55- )
56- )
5743 else :
58- matches = sorted (m .resolve () for m in Path ().glob (entry ))
59- files = [m for m in matches if m .is_file ()]
60- if not files :
61- logger .warning (
62- f"{ entry } did not match any files" ,
63- )
64- resolved .extend (files )
44+ logger .warning (f"Skipping { entry } : not a file" )
6545
6646 # Deduplicate while preserving order
6747 seen : set [Path ] = set ()
@@ -82,7 +62,7 @@ def _add_validate_args(parser: argparse.ArgumentParser) -> None:
8262 "inputs" ,
8363 nargs = "+" ,
8464 metavar = "INPUT" ,
85- help = "Files, directories, or glob patterns to validate" ,
65+ help = "Files to validate" ,
8666 )
8767 parser .add_argument (
8868 "--type" ,
@@ -117,18 +97,6 @@ def _add_validate_args(parser: argparse.ArgumentParser) -> None:
11797 default = Path ("validated" ),
11898 help = "Output directory for results (default: ./validated/)" ,
11999 )
120- parser .add_argument (
121- "-w" ,
122- "--workers" ,
123- type = int ,
124- default = 1 ,
125- metavar = "N" ,
126- help = (
127- "Number of parallel worker processes. "
128- "Use 1 (default) for sequential execution, "
129- "which is easier to debug."
130- ),
131- )
132100 parser .set_defaults (func = _run_validate )
133101
134102
@@ -155,7 +123,7 @@ def _build_parser() -> argparse.ArgumentParser:
155123 "Validate GWAS summary statistics files for submission to the GWAS Catalog."
156124 ),
157125 epilog = (
158- "Example: gwascatalog beyondsnp validate data/* .tsv "
126+ "Example: gwascatalog beyondsnp validate data/file .tsv "
159127 "--type GENE --assembly GRCh38"
160128 ),
161129 )
@@ -217,58 +185,31 @@ def _run_validate(args: argparse.Namespace, parser: argparse.ArgumentParser) ->
217185 output_dir : Path = args .output_dir .resolve ()
218186 output_dir .mkdir (parents = True , exist_ok = True )
219187
220- workers : int = max (1 , args .workers )
221-
222- print (f"Validating { len (files )} file(s) with { workers } worker(s)" )
188+ print (f"Validating { len (files )} file(s)" )
223189 print (f"Output: { output_dir } \n " )
224190
225- common_kwargs : dict [str , object ] = {
226- "output_dir" : str (output_dir ),
227- "variation_type" : args .variation_type ,
228- "assembly" : args .assembly ,
229- "primary_effect_size" : args .primary_effect_size ,
230- "allow_zero_pvalues" : args .allow_zero_pvalues ,
231- }
191+ config = SumstatConfig (
192+ allow_zero_p_values = args .allow_zero_pvalues ,
193+ primary_effect_size = args .primary_effect_size ,
194+ assembly = args .assembly ,
195+ )
232196
233197 # ── Dispatch ──────────────────────────────────────────────
234198 results : list [FileResult ] = []
235199
236- if workers == 1 :
237- # Sequential — simple and easy to debug
238- for f in files :
239- print (f" { f .name } ..." , end = " " , flush = True )
240- result = validate_file (input_path = str (f ), ** common_kwargs )
241- passed = not result .fatal_error and result .error_count == 0
242- print ("PASS" if passed else "FAIL" )
243- results .append (result )
244- else :
245- with ProcessPoolExecutor (max_workers = workers ) as pool :
246- future_to_path = {
247- pool .submit (validate_file , input_path = str (f ), ** common_kwargs ): f
248- for f in files
249- }
250- for future in as_completed (future_to_path ):
251- f = future_to_path [future ]
252- try :
253- result = future .result ()
254- except Exception as exc : # noqa: BLE001
255- logger .exception ("Unexpected error for %s" , f .name )
256- result = FileResult (
257- input_path = f ,
258- output_path = None ,
259- error_path = None ,
260- rows_processed = 0 ,
261- valid_count = 0 ,
262- error_count = 0 ,
263- elapsed_seconds = 0.0 ,
264- md5_checksum = None ,
265- fatal_error = str (exc ),
266- )
267- passed = not result .fatal_error and result .error_count == 0
268- print (f" [{ 'PASS' if passed else 'FAIL' } ] { f .name } " )
269- results .append (result )
270-
271- # ── Summary ───────────────────────────────────────────────
200+ # Sequential — simple and easy to debug
201+ for f in files :
202+ print (f" { f .name } ..." , end = " " , flush = True )
203+ result = validate_file (
204+ input_path = str (f ),
205+ config = config ,
206+ output_dir = output_dir ,
207+ variation_type = args .variation_type ,
208+ )
209+ passed = not result .fatal_error and result .error_count == 0
210+ print ("PASS" if passed else "FAIL" )
211+ results .append (result )
212+
272213 _print_summary (results )
273214
274215 has_failures = any (r .error_count > 0 or r .fatal_error is not None for r in results )
0 commit comments