@@ -111,14 +111,14 @@ def _validate(self, vcfdf):
111111 warning - warning messages
112112 """
113113 required_headers = pd .Series (
114- ["#CHROM" , "POS" , "ID" , "REF" , "ALT" , "QUAL" , "FILTER" , "INFO" ]
114+ ["#CHROM" , "POS" , "ID" , "REF" , "ALT" , "QUAL" , "FILTER" , "INFO" , "FORMAT" ]
115115 )
116116 total_error = ""
117117 warning = ""
118118 if not all (required_headers .isin (vcfdf .columns )):
119119 total_error += (
120120 "vcf: Must have these headers: "
121- "CHROM, POS, ID, REF, ALT, QUAL, FILTER, INFO.\n "
121+ "CHROM, POS, ID, REF, ALT, QUAL, FILTER, INFO, FORMAT .\n "
122122 )
123123 else :
124124 # No duplicated values
@@ -131,52 +131,9 @@ def _validate(self, vcfdf):
131131 "vcf: May contain rows that are "
132132 "space delimited instead of tab delimited.\n "
133133 )
134- # Vcf can only have max of 11 columns
135- if len (vcfdf .columns ) > 11 :
136- total_error += (
137- "vcf: Should not have more than 11 columns. Only "
138- "single sample or matched tumor normal vcf files are accepted.\n "
139- )
140- elif len (vcfdf .columns ) > 8 :
141- # If there are greater than 8 columns, there must be the FORMAT column
142- if "FORMAT" not in vcfdf .columns :
143- total_error += "vcf: Must have FORMAT header if sample columns exist.\n "
144- # If 11 columns, this is assumed to be a tumor normal vcf
145- if len (vcfdf .columns ) == 11 :
146- sample_id = vcfdf .columns [- 2 ]
147- normal_id = vcfdf .columns [- 1 ]
148- error = process_functions .validate_genie_identifier (
149- identifiers = pd .Series ([sample_id ]),
150- center = self .center ,
151- filename = "vcf" ,
152- col = "tumor sample column" ,
153- )
154- total_error += error
155- error = process_functions .validate_genie_identifier (
156- identifiers = pd .Series ([normal_id ]),
157- center = self .center ,
158- filename = "vcf" ,
159- col = "normal sample column" ,
160- )
161- total_error += error
162- else :
163- # Everything else above 8 columns that isn't 11 columns
164- # will be assumed to be a single sample vcf.
165- # if TUMOR is not the sample column header, then validate
166- # the sample column header.
167- if "TUMOR" not in vcfdf .columns :
168- sample_id = vcfdf .columns [- 1 ]
169- error = process_functions .validate_genie_identifier (
170- identifiers = pd .Series ([sample_id ]),
171- center = self .center ,
172- filename = "vcf" ,
173- col = "tumor sample column" ,
174- )
175- if error :
176- error = error .replace ("\n " , "" )
177- error += " if vcf represents a single sample and TUMOR is not the sample column header.\n "
178- total_error += error
179-
134+ if vcfdf ["FORMAT" ].isnull ().values .any ():
135+ total_error += "vcf: Must not have missing values in FORMAT column.\n "
136+ total_error += self .validate_tumor_and_normal_sample_columns (input_df = vcfdf )
180137 # Require that they report variants mapped to
181138 # either GRCh37 or hg19 without
182139 # the chr-prefix.
@@ -215,3 +172,102 @@ def _validate(self, vcfdf):
215172 # will parse a VCF in a detailed way,
216173 # and output with warnings or errors if the format is not adhered to
217174 return total_error , warning
175+
def validate_tumor_and_normal_sample_columns(self, input_df: pd.DataFrame) -> str:
    """Check the sample column(s) of a VCF based on its total column count.

    The number of columns in ``input_df`` decides what is checked:

    - More than 11 columns: rejected outright.
    - Exactly 11 columns: treated as a matched tumor/normal VCF. The
      second-to-last column header is validated as the tumor sample
      identifier and the last column header as the normal sample
      identifier.
    - Exactly 10 columns: treated as a single sample VCF. When no column
      is named ``TUMOR``, the last column header is validated as the
      tumor sample identifier; otherwise ``TUMOR`` is the sample column.
    - Fewer than 10 columns: rejected because no sample column can exist.

    Whenever a tumor and/or normal sample column was identified and is
    present in ``input_df``, it must not contain missing values.

    Layout examples (matched tumor/normal, then single sample):

        | OTHER_VCF_COLUMNS | GENIE-GOLD-1-1-tumor | GENIE-GOLD-1-1-normal |
        | OTHER_VCF_COLUMNS | TUMOR |
        | OTHER_VCF_COLUMNS | GENIE-GOLD-1-1 |

    Args:
        input_df (pd.DataFrame): input vcf data to be validated

    Returns:
        str: concatenated error messages, empty when nothing is wrong
    """
    n_columns = len(input_df.columns)
    messages = ""
    tumor_col = None
    normal_col = None

    if n_columns > 11:
        # Too many columns for a single sample or tumor/normal vcf
        messages = (
            "vcf: Should not have more than 11 columns. Only "
            "single sample or matched tumor normal vcf files are accepted.\n "
        )
    elif n_columns == 11:
        # Matched tumor/normal: the two trailing headers are the sample ids
        tumor_col, normal_col = input_df.columns[-2], input_df.columns[-1]
        labelled_columns = (
            (tumor_col, "tumor sample column"),
            (normal_col, "normal sample column"),
        )
        for header, label in labelled_columns:
            messages += process_functions.validate_genie_identifier(
                identifiers=pd.Series([header]),
                center=self.center,
                filename="vcf",
                col=label,
            )
    elif n_columns == 10:
        # Single sample: a literal TUMOR header is accepted as is,
        # any other trailing header has to be a valid identifier
        if "TUMOR" in input_df.columns:
            tumor_col = "TUMOR"
        else:
            tumor_col = input_df.columns[-1]
            messages = process_functions.validate_genie_identifier(
                identifiers=pd.Series([tumor_col]),
                center=self.center,
                filename="vcf",
                col="tumor sample column",
            )
            if messages:
                # Extend the identifier error with single-sample context
                messages = messages.replace("\n ", "")
                messages += " if vcf represents a single sample and TUMOR is not the sample column header.\n "
    else:
        # Not enough columns to hold any sample column
        messages = (
            "vcf: Must have at least 10 columns. "
            "If the vcf represents a single sample, then it's missing a tumor sample column. "
            "If the vcf represents a matched tumor normal, then it's missing both normal sample and tumor sample columns.\n "
        )

    # Missing values are not allowed in whichever sample columns were found
    for header in (tumor_col, normal_col):
        if not process_functions.checkColExist(input_df, key=header):
            continue
        if input_df[header].isnull().values.any():
            messages += f"vcf: Must not have missing values in {header} column.\n "
    return messages
0 commit comments