Skip to content

Commit 1d07796

Browse files
authored
Merge branch 'develop' into gen-2380-update-docs
2 parents c8aa7f0 + 0bf8d69 commit 1d07796

File tree

2 files changed

+295
-73
lines changed

2 files changed

+295
-73
lines changed

genie_registry/vcf.py

Lines changed: 104 additions & 48 deletions
Original file line numberDiff line numberDiff line change
@@ -111,14 +111,14 @@ def _validate(self, vcfdf):
111111
warning - warning messages
112112
"""
113113
required_headers = pd.Series(
114-
["#CHROM", "POS", "ID", "REF", "ALT", "QUAL", "FILTER", "INFO"]
114+
["#CHROM", "POS", "ID", "REF", "ALT", "QUAL", "FILTER", "INFO", "FORMAT"]
115115
)
116116
total_error = ""
117117
warning = ""
118118
if not all(required_headers.isin(vcfdf.columns)):
119119
total_error += (
120120
"vcf: Must have these headers: "
121-
"CHROM, POS, ID, REF, ALT, QUAL, FILTER, INFO.\n"
121+
"CHROM, POS, ID, REF, ALT, QUAL, FILTER, INFO, FORMAT.\n"
122122
)
123123
else:
124124
# No duplicated values
@@ -131,52 +131,9 @@ def _validate(self, vcfdf):
131131
"vcf: May contain rows that are "
132132
"space delimited instead of tab delimited.\n"
133133
)
134-
# Vcf can only have max of 11 columns
135-
if len(vcfdf.columns) > 11:
136-
total_error += (
137-
"vcf: Should not have more than 11 columns. Only "
138-
"single sample or matched tumor normal vcf files are accepted.\n"
139-
)
140-
elif len(vcfdf.columns) > 8:
141-
# If there are greater than 8 columns, there must be the FORMAT column
142-
if "FORMAT" not in vcfdf.columns:
143-
total_error += "vcf: Must have FORMAT header if sample columns exist.\n"
144-
# If 11 columns, this is assumed to be a tumor normal vcf
145-
if len(vcfdf.columns) == 11:
146-
sample_id = vcfdf.columns[-2]
147-
normal_id = vcfdf.columns[-1]
148-
error = process_functions.validate_genie_identifier(
149-
identifiers=pd.Series([sample_id]),
150-
center=self.center,
151-
filename="vcf",
152-
col="tumor sample column",
153-
)
154-
total_error += error
155-
error = process_functions.validate_genie_identifier(
156-
identifiers=pd.Series([normal_id]),
157-
center=self.center,
158-
filename="vcf",
159-
col="normal sample column",
160-
)
161-
total_error += error
162-
else:
163-
# Everything else above 8 columns that isn't 11 columns
164-
# will be assumed to be a single sample vcf.
165-
# if TUMOR is not the sample column header, then validate
166-
# the sample column header.
167-
if "TUMOR" not in vcfdf.columns:
168-
sample_id = vcfdf.columns[-1]
169-
error = process_functions.validate_genie_identifier(
170-
identifiers=pd.Series([sample_id]),
171-
center=self.center,
172-
filename="vcf",
173-
col="tumor sample column",
174-
)
175-
if error:
176-
error = error.replace("\n", "")
177-
error += " if vcf represents a single sample and TUMOR is not the sample column header.\n"
178-
total_error += error
179-
134+
if vcfdf["FORMAT"].isnull().values.any():
135+
total_error += "vcf: Must not have missing values in FORMAT column.\n"
136+
total_error += self.validate_tumor_and_normal_sample_columns(input_df=vcfdf)
180137
# Require that they report variants mapped to
181138
# either GRCh37 or hg19 without
182139
# the chr-prefix.
@@ -215,3 +172,102 @@ def _validate(self, vcfdf):
215172
# will parse a VCF in a detailed way,
216173
# and output with warnings or errors if the format is not adhered to
217174
return total_error, warning
175+
176+
def validate_tumor_and_normal_sample_columns(self, input_df: pd.DataFrame) -> str:
    """Check the sample column(s) of a VCF based on its total column count.

    The number of columns decides how the trailing column(s) are treated:

    - More than 11 columns: rejected, only single sample or matched
      tumor normal VCFs are accepted.
    - Exactly 11 columns: treated as a matched tumor normal VCF. The
      second-to-last header is taken as the tumor sample column and the
      last header as the normal sample column; both headers are passed to
      ``process_functions.validate_genie_identifier``.
    - Exactly 10 columns: treated as a single sample VCF. If a column
      named ``TUMOR`` exists it is used as the tumor sample column and its
      header is not validated further; otherwise the last header is
      passed to ``process_functions.validate_genie_identifier``.
    - Fewer than 10 columns: rejected, because a tumor sample column is
      required in addition to the 9 required VCF columns.

    Whenever a tumor and/or normal sample column was identified and is
    present in the dataframe, it must not contain missing values.

    Example headers:
        Matched tumor normal:
            ``<other VCF columns> | GENIE-GOLD-1-1-tumor | GENIE-GOLD-1-1-normal``
        Single sample:
            ``<other VCF columns> | TUMOR`` or
            ``<other VCF columns> | GENIE-GOLD-1-1``

    Args:
        input_df (pd.DataFrame): input vcf data to be validated

    Returns:
        str: concatenated error messages, empty string when nothing failed
    """

    def _identifier_error(column_name, label: str) -> str:
        # Run the shared GENIE identifier check on a single column header.
        return process_functions.validate_genie_identifier(
            identifiers=pd.Series([column_name]),
            center=self.center,
            filename="vcf",
            col=label,
        )

    columns = input_df.columns
    column_count = len(columns)
    tumor_col = None
    normal_col = None
    messages = ""

    if column_count > 11:
        # Too many columns to be a single sample or tumor/normal pair.
        messages = (
            "vcf: Should not have more than 11 columns. Only "
            "single sample or matched tumor normal vcf files are accepted.\n"
        )
    elif column_count < 10:
        # No room for even one sample column after the required ones.
        messages = (
            "vcf: Must have at least 10 columns. "
            "If the vcf represents a single sample, then it's missing a tumor sample column. "
            "If the vcf represents a matched tumor normal, then it's missing both normal sample and tumor sample columns.\n"
        )
    elif column_count == 11:
        # Matched tumor normal: the last two headers are the sample columns.
        tumor_col, normal_col = columns[-2], columns[-1]
        tumor_error = _identifier_error(tumor_col, "tumor sample column")
        normal_error = _identifier_error(normal_col, "normal sample column")
        messages = tumor_error + normal_error
    elif "TUMOR" in columns:
        # Single sample VCF using the literal TUMOR header; accepted as is.
        tumor_col = "TUMOR"
    else:
        # Single sample VCF whose last header must be a valid identifier.
        tumor_col = columns[-1]
        messages = _identifier_error(tumor_col, "tumor sample column")
        if messages:
            # Fold the explanation into the same line as the identifier error.
            messages = messages.replace("\n", "")
            messages += " if vcf represents a single sample and TUMOR is not the sample column header.\n"

    # Any sample column that was identified and exists must be fully populated.
    for sample_col in (tumor_col, normal_col):
        if (
            process_functions.checkColExist(input_df, key=sample_col)
            and input_df[sample_col].isnull().values.any()
        ):
            messages += f"vcf: Must not have missing values in {sample_col} column.\n"
    return messages

0 commit comments

Comments
 (0)