Skip to content

Commit c5902b8

Browse files
committed
add to docstring, add more tests
1 parent 90f1a78 commit c5902b8

File tree

2 files changed

+53
-38
lines changed

2 files changed

+53
-38
lines changed

genie_registry/vcf.py

Lines changed: 28 additions & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -133,9 +133,7 @@ def _validate(self, vcfdf):
133133
)
134134
if vcfdf["FORMAT"].isnull().values.any():
135135
total_error += "vcf: Must not have missing values in FORMAT column.\n"
136-
total_error += self.validate_tumor_and_normal_sample_columns_exist(
137-
input_df=vcfdf
138-
)
136+
total_error += self.validate_tumor_and_normal_sample_columns(input_df=vcfdf)
139137
# Require that they report variants mapped to
140138
# either GRCh37 or hg19 without
141139
# the chr-prefix.
@@ -175,37 +173,33 @@ def _validate(self, vcfdf):
175173
# and output with warnings or errors if the format is not adhered to
176174
return total_error, warning
177175

178-
def validate_tumor_and_normal_sample_columns_exist(
179-
self, input_df: pd.DataFrame
180-
) -> str:
181-
"""Validates that the expected tumor sample column and optional normal
182-
sample columns are present in the VCF depending on how many
183-
columns you have present in the VCF and they have no missing values
184-
185-
Rules:
186-
- VCFs can only have a max of 11 columns including the 9 required columns
187-
- For 11 columns VCFs, it is assumed this is a matched tumor normal vcf file
188-
which means there should be a tumor sample and normal sample
189-
column present
190-
- For 10 column VCFs, it is assumed this is a single sample vcf file
191-
which means there should be a tumor sample column present
192-
- Anything lower than 10 columns is INVALID because you must have at
193-
least a tumor sample column on top of the 9 required VCF columns
194-
195-
- If tumor sample and normal sample columns are present, they must not have
196-
any missing values.
197-
198-
Examples:
199-
200-
VCF with Matched Tumor Normal columns:
201-
| GENIE-GOLD-1-1-tumor | GENIE-GOLD-1-1-normal |
202-
| -------------------- | --------------------- |
203-
| | |
204-
205-
VCF with Single Tumor VCF column:
206-
| TUMOR |
207-
| ----- |
208-
| |
176+
def validate_tumor_and_normal_sample_columns(self, input_df: pd.DataFrame) -> str:
177+
"""
178+
Validates that the expected tumor sample column and optional normal
179+
sample columns are present in the VCF depending on how many
180+
columns you have present in the VCF and they have no missing values
181+
182+
Rules:
183+
- VCFs can only have a max of 11 columns including the 9 required columns
184+
- For 11 column VCFs, it is assumed this is a matched tumor normal vcf file
185+
which means there should be a tumor sample and normal sample
186+
column present
187+
- For 10 column VCFs, it is assumed this is a single sample vcf file
188+
which means there should be a tumor sample column present
189+
- Anything lower than 10 columns is INVALID because you must have at
190+
least a tumor sample column on top of the 9 required VCF columns
191+
- If tumor sample and/or normal sample columns are present, they must not have
192+
any missing values.
193+
194+
Example: VCF with Matched Tumor Normal columns
195+
| OTHER_VCF_COLUMNS | GENIE-GOLD-1-1-tumor | GENIE-GOLD-1-1-normal |
196+
| ----------------- | -------------------- | --------------------- |
197+
| ... | ... | ... |
198+
199+
Example: VCF with Single Tumor VCF column
200+
| OTHER_VCF_COLUMNS | TUMOR |
201+
| ----------------- | ----- |
202+
| ... | ... |
209203
210204
Args:
211205
input_df (pd.DataFrame): input vcf data to be validated

tests/test_vcf.py

Lines changed: 25 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -198,6 +198,26 @@ def test_validation_invalid_two_samples_normal(vcf_class):
198198
assert warning == ""
199199

200200

201+
def test_validation_invalid_format_has_nas(vcf_class):
202+
vcfDf = pd.DataFrame(
203+
{
204+
"#CHROM": ["2", "9", "12"],
205+
"POS": [69688533, 99401860, 53701241],
206+
"ID": ["AAK1", "AAED1", "AAAS"],
207+
"REF": ["AANT", "AACG", "AAAN"],
208+
"ALT": ["AAK1", "AAED1", "AAAS"],
209+
"QUAL": ["AAK1", "AAED1", "AAAS"],
210+
"FILTER": ["AAK1", "AAED1", "AAAS"],
211+
"INFO": ["AAK1", "AAED1", "AAAS"],
212+
"FORMAT": [None, "AAED1", "AAAS"],
213+
"TUMOR": ["AAK1", "AAED1", "AAAS"],
214+
}
215+
)
216+
error, warning = vcf_class._validate(vcfDf)
217+
assert error == "vcf: Must not have missing values in FORMAT column.\n"
218+
assert warning == ""
219+
220+
201221
def test_validation_invalid_white_space(vcf_class):
202222
vcfDf = pd.DataFrame(
203223
{
@@ -427,11 +447,11 @@ def test_that__get_dataframe_uses_correct_columns_to_replace(
427447
"FILTER",
428448
"INFO",
429449
"FORMAT",
430-
"GENIE-SAGE-1-1-normal",
431450
"GENIE-SAGE-1-1-tumor",
451+
"GENIE-SAGE-1-1-normal",
432452
],
433453
[[1, 2, 3, 4, 5, 6, 7, 8, 9, "a", None]],
434-
"vcf: Must not have missing values in GENIE-SAGE-1-1-tumor column.\n",
454+
"vcf: Must not have missing values in GENIE-SAGE-1-1-normal column.\n",
435455
),
436456
# Case 7: 11 columns, wrong normal/tumor naming
437457
(
@@ -480,13 +500,14 @@ def test_that__get_dataframe_uses_correct_columns_to_replace(
480500
"invalid_tumor_sample_name",
481501
"tumor_sample_col_has_nas",
482502
"valid_matched_tumor_normal",
503+
"normal_sample_col_has_nas",
483504
"invalid_normal_sample_name",
484505
"more_than_11_cols",
485506
],
486507
)
487-
def test_validate_tumor_and_normal_sample_columns_exist(
508+
def test_validate_tumor_and_normal_sample_columns(
488509
columns, data, expected_in_error, vcf_class
489510
):
490511
df = pd.DataFrame(data, columns=columns)
491-
result = vcf_class.validate_tumor_and_normal_sample_columns_exist(df)
512+
result = vcf_class.validate_tumor_and_normal_sample_columns(df)
492513
assert expected_in_error == result

0 commit comments

Comments
 (0)