@@ -27,15 +27,15 @@ class SumstatConfig(TypedDict):
2727 """Runtime configuration for validating summary stats"""
2828
2929 allow_zero_p_values : bool
30- assembly : GenomeAssembly
30+ assembly : GenomeAssembly | None
3131 primary_effect_size : Literal ["beta" , "odds_ratio" , "hazard_ratio" , "z_score" ] | None
3232
3333
3434class SumstatError (TypedDict ):
3535 """A parsed pydantic ValidationError"""
3636
3737 row : int
38- loc : int | None
38+ column : str | int | None
3939 msg : str
4040
4141
@@ -59,7 +59,6 @@ def __init__(
5959 data_model : type [CNVSumstatModel | GeneSumstatModel ],
6060 input_path : Path ,
6161 config : SumstatConfig ,
62- min_records : int | None = None ,
6362 ):
6463 self ._data_model = data_model
6564 self ._path = Path (input_path )
@@ -69,35 +68,36 @@ def __init__(
6968 if not self ._path .exists ():
7069 raise FileNotFoundError (self ._path )
7170
72- if min_records is None :
73- self ._min_records = self ._data_model .MIN_RECORDS
74- else :
75- self ._min_records = min_records
76-
7771 n_rows = self .n_rows
78- if self ._min_records is not None and n_rows < self ._min_records :
79- raise ValueError (f"Not enough rows in file: { n_rows = } { self ._min_records = } " )
72+ if n_rows < self .data_model .MIN_RECORDS :
73+ warning = f"""
74+ It looks like you only have { n_rows } rows in { self ._path } .
75+ { self .data_model } recommends at least { self .data_model .MIN_RECORDS } (before
76+ any QC steps). Please include all results, not just top hits.
77+ The GWAS Catalog inclusion criteria requires studies to be genome-wide.
78+ Please get in touch with gwas-subs@ebi.ac.uk if you have any questions.
79+ """
80+ logger .warning (warning )
8081
8182 # Validate first row to check column structure — fail fast on bad columns
8283 _ = self .output_fieldnames
8384
8485 def _open_sumstat (self ) -> IO [str ]:
86+ # don't forget to strip UTF-8 BOM from Excel-exported files
87+ # newline = "" is best for CSV files - let the dictreader parser handle it
8588 if _is_gzip (self ._path ):
86- return gzip .open (self ._path , "rt" , encoding = "utf-8" , newline = None )
87- return self ._path .open (mode = "rt" , encoding = "utf-8" , newline = None )
89+ return gzip .open (self ._path , "rt" , encoding = "utf-8-sig " , newline = "" )
90+ return self ._path .open (mode = "rt" , encoding = "utf-8-sig " , newline = "" )
8891
8992 def parse_csv (self , sample_size : int = 4096 ) -> Generator [dict ]:
9093 """Automatically detect CSV delimiter and yield each row as a dict"""
9194 with self ._open_sumstat () as f :
9295 sample = f .read (sample_size )
9396 sniffer = csv .Sniffer ()
9497 dialect = sniffer .sniff (sample , delimiters = ",\t ;| " )
95-
96- if not sniffer .has_header (sample ):
97- raise ValueError ("file doesn't appear to contain a header" )
98-
99- f .seek (0 ) # reset to start of the file
98+ f .seek (0 )
10099 reader = csv .DictReader (f , dialect = dialect )
100+
101101 yield from reader
102102
103103 @cached_property
@@ -119,21 +119,10 @@ def output_fieldnames(self) -> list[str]:
119119 ValidationError: If the first row fails validation, indicating
120120 an invalid column set (e.g. missing required columns).
121121 """
122- first_row = next (self .parse_csv ())
123- try :
124- instance = self ._data_model .model_validate (first_row , context = self ._config )
125- except ValidationError as e :
126- logger .critical (f"First row of { self ._path .name } failed validation" )
127- logger .critical (f"{ ValidationError } " )
128- msg = (
129- f"The first row of { self ._path .name } failed validation. "
130- "This usually means the file has missing or incorrectly "
131- "named columns. Valid column names include: "
132- f"{ self .data_model .VALID_FIELD_NAMES } "
133- )
134- raise ValueError (msg ) from e
122+ present = next (self .parse_csv (), None )
123+ if present is None :
124+ raise ValueError (f"Can't read anything from { self ._path } " )
135125
136- present = list (instance .model_dump (exclude_none = True ).keys ())
137126 field_map = self ._data_model .FIELD_MAP
138127
139128 # get a list fields sorted by their field map index
@@ -157,30 +146,6 @@ def n_rows(self) -> int:
157146 next (f , None ) # skip header
158147 return sum (1 for _ in f )
159148
160- def validate_rows (self ) -> Generator [dict ]:
161- """Validate all rows, storing errors in self._errors and yielding validated
162- rows.
163- """
164- for i , row in enumerate (self .parse_csv ()):
165- try :
166- validated = self ._data_model .model_validate (
167- row , context = self ._config
168- ).model_dump ()
169- except ValidationError as exc :
170- for error in exc .errors ():
171- location = int (error ["loc" ][0 ])
172- self ._errors .append (
173- SumstatError (row = i , loc = location , msg = error ["msg" ])
174- )
175-
176- if len (self ._errors ) >= self .MAX_ERRORS :
177- logger .critical (
178- f"Stopped validation after { self .MAX_ERRORS } errors"
179- )
180- break
181- else :
182- yield validated
183-
184149 @property
185150 def errors (self ) -> list [SumstatError ]:
186151 """Return all row errors encountered"""
@@ -259,11 +224,11 @@ def __iter__(self) -> Generator[ValidatedRow]:
259224 except ValidationError as exc :
260225 for error in exc .errors ():
261226 try :
262- location = int ( error ["loc" ][0 ])
227+ location = error ["loc" ][0 ]
263228 except IndexError :
264229 location = None
265230 self ._table .add_error (
266- SumstatError (row = i , loc = location , msg = error ["msg" ])
231+ SumstatError (row = i , column = location , msg = error ["msg" ])
267232 )
268233 yield ValidatedRow (row_number = i , is_valid = False )
269234
0 commit comments