add CLI integration tests

nebfield · nebfield · commit 8dc72a00f5e9 · 2026-03-09T17:09:46.000Z
diff --git a/src/gwascatalog/sumstatapp/cli/__main__.py b/src/gwascatalog/sumstatapp/cli/__main__.py
@@ -219,16 +219,6 @@ def _run_validate(args: argparse.Namespace, parser: argparse.ArgumentParser) ->
 
     workers: int = max(1, args.workers)
 
-    # Warn about duplicate stems that would clobber output files
-    from gwascatalog.sumstatapp.cli._validate import output_stem
-
-    stems = [output_stem(f) for f in files]
-    if len(stems) != len(set(stems)):
-        print(
-            "WARNING: Duplicate file stems detected — output files may be overwritten",
-            file=sys.stderr,
-        )
-
     print(f"Validating {len(files)} file(s) with {workers} worker(s)")
     print(f"Output: {output_dir}\n")
 
diff --git a/src/gwascatalog/sumstatapp/cli/_validate.py b/src/gwascatalog/sumstatapp/cli/_validate.py
@@ -12,7 +12,7 @@
 import time
 from dataclasses import dataclass
 from pathlib import Path
-from typing import TYPE_CHECKING
+from typing import TYPE_CHECKING, Literal
 
 from gwascatalog.sumstatlib import (
     CNVSumstatModel,
@@ -60,29 +60,14 @@ def _get_model(variation_type: str) -> type[CNVSumstatModel] | type[GeneSumstatM
             raise ValueError(f"Unsupported variation type: {variation_type}")
 
 
-def output_stem(path: Path) -> str:
-    """Derive an output file stem, stripping archive and tabular extensions.
-
-    Examples::
-
-        output_stem(Path("study.tsv.gz")) == "study"
-        output_stem(Path("study.tsv"))    == "study"
-        output_stem(Path("study.csv.gz")) == "study"
-    """
-    stem = path.stem
-    if path.suffix == ".gz":
-        stem = Path(stem).stem
-    return stem
-
-
 def _write_error_report(error_path: Path, errors: list[SumstatError]) -> None:
     """Write validation errors to a human-readable TSV file."""
+    dict_errors = [dict(e) for e in errors]
+
     with error_path.open("w", encoding="utf-8", newline="") as f:
-        writer = csv.writer(f, delimiter="\t")
-        writer.writerow(["row", "column", "message"])
-        for e in errors:
-            column = e["loc"] if e["loc"] is not None else ""
-            writer.writerow([e["row"], column, e["msg"]])
+        writer = csv.DictWriter(f, delimiter="\t", fieldnames=["row", "column", "msg"])
+        writer.writeheader()
+        writer.writerows(dict_errors)
 
 
 def _compute_md5(path: Path) -> str:
@@ -102,7 +87,8 @@ def validate_file(
     output_dir: str,
     variation_type: str,
     assembly: str | None,
-    primary_effect_size: str | None,
+    primary_effect_size: Literal["beta", "odds_ratio", "hazard_ratio", "z_score"]
+    | None,
     allow_zero_pvalues: bool,
 ) -> FileResult:
     """Validate a single summary statistics file and write results.
@@ -115,9 +101,13 @@ def validate_file(
     """
     inp = Path(input_path)
     out_dir = Path(output_dir)
-    stem = output_stem(inp)
-    output_path = out_dir / f"{stem}.tsv.gz"
-    error_path = out_dir / f"{stem}.errors.tsv"
+    output_path = out_dir / f"validated_{inp.stem}.tsv.gz"
+    error_path = out_dir / f"{inp.stem}.errors.tsv"
+
+    if output_path.exists():
+        raise FileExistsError(output_path)
+    if error_path.exists():
+        raise FileExistsError(error_path)
 
     start = time.monotonic()
 
@@ -133,8 +123,9 @@ def validate_file(
 
         rows_processed = 0
         valid_count = 0
+        writer = table.open_writer(output_path, compress=True)
 
-        for row in table.open_writer(output_path, compress=True):
+        for row in writer:
             rows_processed += 1
             if row.is_valid:
                 valid_count += 1
diff --git a/src/gwascatalog/sumstatapp/web/static/examples/invalid-cnv.csv b/src/gwascatalog/sumstatapp/web/static/examples/invalid-cnv.csv
@@ -0,0 +1,3 @@
+﻿romosome,base_pair_start,base_pair_end,neg_log10_p_value,p_value,beta,standard_error,statistical_model_type
+1,16600001,,9.45,3.54813E-10,0.048,0.008,additive
+,86415001,86425000,13.661,2.18273E-14,-0.035,,additive
diff --git a/src/gwascatalog/sumstatapp/web/static/examples/invalid-gene.csv b/src/gwascatalog/sumstatapp/web/static/examples/invalid-gene.csv
@@ -0,0 +1,3 @@
+hgnc_symbol,ensembl_gene_id,p_value
+ISG20,ENSG00000172183,0.0001
+,ENSG00000128886,
diff --git a/src/gwascatalog/sumstatapp/web/static/examples/valid-cnv.csv b/src/gwascatalog/sumstatapp/web/static/examples/valid-cnv.csv
@@ -0,0 +1,3 @@
+﻿chromosome,base_pair_start,base_pair_end,neg_log10_p_value,beta,standard_error,statistical_model_type,extra_test_column
+1,16600001,16605000,9.45,0.048,0.008,additive,test1
+X,86415001,86425000,13.661,-0.035,0.003,additive,test2
diff --git a/src/gwascatalog/sumstatapp/web/static/examples/valid-gene.csv b/src/gwascatalog/sumstatapp/web/static/examples/valid-gene.csv
@@ -0,0 +1,3 @@
+ensembl_gene_id,p_value,chromosome,base_pair_start,base_pair_end,beta,standard_error,extra_test_column
+ENSG00000128886,0.1,15,43772605,43777315,0.420,2,test1
+ENSG00000172183,0.0001,15,88635618,88656483,0.048,0.0006,test2
diff --git a/sumstatlib/src/gwascatalog/sumstatlib/cnv/models.py b/sumstatlib/src/gwascatalog/sumstatlib/cnv/models.py
@@ -43,7 +43,7 @@ class CNVSumstatModel(BaseSumstatModel):
       - allow_zero_pvalues (bool, optional):
     """
 
-    MIN_RECORDS: ClassVar[None] = MIN_CNV_RECORDS
+    MIN_RECORDS: ClassVar[int] = MIN_CNV_RECORDS
     FIELD_MAP: ClassVar[Mapping[str, int]] = CNV_FIELD_INDEX_MAP
     VALID_FIELD_NAMES: ClassVar[list[str]] = list(CNV_FIELD_INDEX_MAP.keys())
 
diff --git a/sumstatlib/src/gwascatalog/sumstatlib/constants.py b/sumstatlib/src/gwascatalog/sumstatlib/constants.py
@@ -47,4 +47,4 @@
 
 # see decision docs for justification
 MIN_GENE_RECORDS: Final[int] = 10_000
-MIN_CNV_RECORDS: Final = None
+MIN_CNV_RECORDS: Final[int] = 10_000
diff --git a/sumstatlib/src/gwascatalog/sumstatlib/sumstattable.py b/sumstatlib/src/gwascatalog/sumstatlib/sumstattable.py
@@ -27,15 +27,15 @@ class SumstatConfig(TypedDict):
     """Runtime configuration for validating summary stats"""
 
     allow_zero_p_values: bool
-    assembly: GenomeAssembly
+    assembly: GenomeAssembly | None
     primary_effect_size: Literal["beta", "odds_ratio", "hazard_ratio", "z_score"] | None
 
 
 class SumstatError(TypedDict):
     """A parsed pydantic ValidationError"""
 
     row: int
-    loc: int | None
+    column: str | int | None
     msg: str
 
 
@@ -59,7 +59,6 @@ def __init__(
         data_model: type[CNVSumstatModel | GeneSumstatModel],
         input_path: Path,
         config: SumstatConfig,
-        min_records: int | None = None,
     ):
         self._data_model = data_model
         self._path = Path(input_path)
@@ -69,35 +68,36 @@ def __init__(
         if not self._path.exists():
             raise FileNotFoundError(self._path)
 
-        if min_records is None:
-            self._min_records = self._data_model.MIN_RECORDS
-        else:
-            self._min_records = min_records
-
         n_rows = self.n_rows
-        if self._min_records is not None and n_rows < self._min_records:
-            raise ValueError(f"Not enough rows in file: {n_rows=} {self._min_records=}")
+        if n_rows < self.data_model.MIN_RECORDS:
+            warning = f"""
+            It looks like you only have {n_rows} rows in {self._path}.
+            {self.data_model} recommends at least {self.data_model.MIN_RECORDS} (before
+            any QC steps). Please include all results, not just top hits.
+            The GWAS Catalog inclusion criteria requires studies to be genome-wide.
+            Please get in touch with gwas-subs@ebi.ac.uk if you have any questions.
+            """
+            logger.warning(warning)
 
         # Validate first row to check column structure — fail fast on bad columns
         _ = self.output_fieldnames
 
     def _open_sumstat(self) -> IO[str]:
+        # "utf-8-sig" strips the UTF-8 BOM found in Excel-exported files
+        # newline="" is recommended for CSV files: the csv reader handles line endings
         if _is_gzip(self._path):
-            return gzip.open(self._path, "rt", encoding="utf-8", newline=None)
-        return self._path.open(mode="rt", encoding="utf-8", newline=None)
+            return gzip.open(self._path, "rt", encoding="utf-8-sig", newline="")
+        return self._path.open(mode="rt", encoding="utf-8-sig", newline="")
 
     def parse_csv(self, sample_size: int = 4096) -> Generator[dict]:
         """Automatically detect CSV delimiter and yield each row as a dict"""
         with self._open_sumstat() as f:
             sample = f.read(sample_size)
             sniffer = csv.Sniffer()
             dialect = sniffer.sniff(sample, delimiters=",\t;| ")
-
-            if not sniffer.has_header(sample):
-                raise ValueError("file doesn't appear to contain a header")
-
-            f.seek(0)  # reset to start of the file
+            f.seek(0)
             reader = csv.DictReader(f, dialect=dialect)
+
             yield from reader
 
     @cached_property
@@ -119,21 +119,10 @@ def output_fieldnames(self) -> list[str]:
             ValidationError: If the first row fails validation, indicating
                 an invalid column set (e.g. missing required columns).
         """
-        first_row = next(self.parse_csv())
-        try:
-            instance = self._data_model.model_validate(first_row, context=self._config)
-        except ValidationError as e:
-            logger.critical(f"First row of {self._path.name} failed validation")
-            logger.critical(f"{ValidationError}")
-            msg = (
-                f"The first row of {self._path.name} failed validation. "
-                "This usually means the file has missing or incorrectly "
-                "named columns. Valid column names include: "
-                f"{self.data_model.VALID_FIELD_NAMES}"
-            )
-            raise ValueError(msg) from e
+        present = next(self.parse_csv(), None)
+        if present is None:
+            raise ValueError(f"Can't read anything from {self._path}")
 
-        present = list(instance.model_dump(exclude_none=True).keys())
         field_map = self._data_model.FIELD_MAP
 
         # get a list of fields sorted by their field map index
@@ -157,30 +146,6 @@ def n_rows(self) -> int:
             next(f, None)  # skip header
             return sum(1 for _ in f)
 
-    def validate_rows(self) -> Generator[dict]:
-        """Validate all rows, storing errors in self._errors and yielding validated
-        rows.
-        """
-        for i, row in enumerate(self.parse_csv()):
-            try:
-                validated = self._data_model.model_validate(
-                    row, context=self._config
-                ).model_dump()
-            except ValidationError as exc:
-                for error in exc.errors():
-                    location = int(error["loc"][0])
-                    self._errors.append(
-                        SumstatError(row=i, loc=location, msg=error["msg"])
-                    )
-
-                if len(self._errors) >= self.MAX_ERRORS:
-                    logger.critical(
-                        f"Stopped validation after {self.MAX_ERRORS} errors"
-                    )
-                    break
-            else:
-                yield validated
-
     @property
     def errors(self) -> list[SumstatError]:
         """Return all row errors encountered"""
@@ -259,11 +224,11 @@ def __iter__(self) -> Generator[ValidatedRow]:
             except ValidationError as exc:
                 for error in exc.errors():
                     try:
-                        location = int(error["loc"][0])
+                        location = error["loc"][0]
                     except IndexError:
                         location = None
                     self._table.add_error(
-                        SumstatError(row=i, loc=location, msg=error["msg"])
+                        SumstatError(row=i, column=location, msg=error["msg"])
                     )
                 yield ValidatedRow(row_number=i, is_valid=False)
 
diff --git a/tests/conftest.py b/tests/conftest.py
diff --git a/tests/integration/test_cli_cnv.py b/tests/integration/test_cli_cnv.py
diff --git a/tests/integration/test_cli_gene.py b/tests/integration/test_cli_gene.py

Original file line number	Diff line number	Diff line change
`@@ -0,0 +1,3 @@`
	`1`	`+romosome,base_pair_start,base_pair_end,neg_log10_p_value,p_value,beta,standard_error,statistical_model_type`
	`2`	`+1,16600001,,9.45,3.54813E-10,0.048,0.008,additive`
	`3`	`+,86415001,86425000,13.661,2.18273E-14,-0.035,,additive`
Original file line number	Diff line number	Diff line change
`@@ -0,0 +1,3 @@`
	`1`	`+hgnc_symbol,ensembl_gene_id,p_value`
	`2`	`+ISG20,ENSG00000172183,0.0001`
	`3`	`+,ENSG00000128886,`
Original file line number	Diff line number	Diff line change
`@@ -0,0 +1,3 @@`
	`1`	`+chromosome,base_pair_start,base_pair_end,neg_log10_p_value,beta,standard_error,statistical_model_type,extra_test_column`
	`2`	`+1,16600001,16605000,9.45,0.048,0.008,additive,test1`
	`3`	`+X,86415001,86425000,13.661,-0.035,0.003,additive,test2`
Original file line number	Diff line number	Diff line change
`@@ -0,0 +1,3 @@`
	`1`	`+ensembl_gene_id,p_value,chromosome,base_pair_start,base_pair_end,beta,standard_error,extra_test_column`
	`2`	`+ENSG00000128886,0.1,15,43772605,43777315,0.420,2,test1`
	`3`	`+ENSG00000172183,0.0001,15,88635618,88656483,0.048,0.0006,test2`