|
11 | 11 | logger = logging.getLogger(__name__) |
12 | 12 |
|
13 | 13 |
|
14 | | -def _check_allele_col_validity(df): |
15 | | - """There are two linked validation rules in this function: |
16 | | -
|
17 | | - 1) If maf file has ALL three of the following columns: |
18 | | - - TUMOR_SEQ_ALLELE1 (TSA1) |
19 | | - - TUMOR_SEQ_ALLELE2 (TSA2) |
20 | | - - REFERENCE ALLELE (REF) |
21 | | - THEN |
22 | | - ALL rows of TSA1 must equal REF |
23 | | - OR |
24 | | - ALL rows of TSA1 must equal TSA2 |
25 | | -
|
26 | | - TSA1 is used by Genome Nexus (GN) to annotate data when it senses there is ambiguity |
27 | | - regarding which variant (TSA1 vs TSA2) to use. This is |
28 | | - why there cannot be mixed rows where some rows have TSA1 == REF and some rows |
29 | | - have TSA1 == TSA2. |
30 | | -
|
31 | | - e.g: |
32 | | - VALID |
33 | | - | REFERENCE_ALLELE | TUMOR_SEQ_ALLELE1 | TUMOR_SEQ_ALLELE2 |
34 | | - | C | C | A |
35 | | - | T | T | C |
36 | | -
|
37 | | - VALID |
38 | | - | REFERENCE_ALLELE | TUMOR_SEQ_ALLELE1 | TUMOR_SEQ_ALLELE2 |
39 | | - | C | A | A |
40 | | - | T | C | C |
41 | | -
|
42 | | - INVALID |
43 | | - | REFERENCE_ALLELE | TUMOR_SEQ_ALLELE1 | TUMOR_SEQ_ALLELE2 |
44 | | - | C | C | A |
45 | | - | C | A | A |
46 | | -
|
47 | | - See https://github.com/genome-nexus/annotation-tools/issues/26 for |
48 | | - more background regarding why this validation rule was implemented. |
49 | | -
|
50 | | - 2) There can't be ANY rows where REF == TSA2. This is a missense mutation |
51 | | - flagged as invalid by GN |
| 14 | +def _check_allele_col_validity(df: pd.DataFrame) -> str: |
| 15 | + """ |
| 16 | + Validate the allele columns (REFERENCE_ALLELE, TUMOR_SEQ_ALLELE1 and |
| 17 | + TUMOR_SEQ_ALLELE2) of a MAF (Mutation Annotation Format) file. |
| 18 | +
|
| 19 | + The following conditions must be met: |
| 20 | + **If the MAF file has all three of these columns** |
| 21 | +
|
| 22 | + - TUMOR_SEQ_ALLELE1 (TSA1) |
| 23 | + - TUMOR_SEQ_ALLELE2 (TSA2) |
| 24 | + - REFERENCE_ALLELE (REF) |
| 25 | +
|
| 26 | + **Then, one of the following must be true** |
| 27 | +
|
| 28 | + - In every row, the value of TSA1 must equal the value of REF |
| 29 | + - In every row, the value of TSA1 must equal the value of TSA2 |
| 30 | +
|
| 31 | + **Additionally, if the MAF file has at least these two columns** |
| 32 | +
|
| 33 | + - REFERENCE_ALLELE (REF) |
| 34 | + - TUMOR_SEQ_ALLELE2 (TSA2) |
| 35 | +
|
| 36 | + **Then** |
| 37 | +
|
| 38 | + No row may have the same value in REF and TSA2 (the comparison is row by row) |
| 39 | +
|
| 40 | + The TSA1 rule is important because Genome Nexus (GN) uses `TSA1` to annotate data |
| 41 | + when it's not clear which variant (`TSA1` vs `TSA2`) to use. So, there can't be a mix |
| 42 | + of rows where some have `TSA1` equal to `REF` and some have `TSA1` equal to `TSA2`. |
| 43 | +
|
| 44 | + Example: Valid Examples |
| 45 | + ``` |
| 46 | + | REFERENCE_ALLELE | TUMOR_SEQ_ALLELE1 | TUMOR_SEQ_ALLELE2 | |
| 47 | + | ---------------- | ----------------- | ----------------- | |
| 48 | + | C | C | A | |
| 49 | + | T | T | C | |
| 50 | + ``` |
| 51 | +
|
| 52 | + ``` |
| 53 | + | REFERENCE_ALLELE | TUMOR_SEQ_ALLELE1 | TUMOR_SEQ_ALLELE2 | |
| 54 | + | ---------------- | ----------------- | ----------------- | |
| 55 | + | C | A | A | |
| 56 | + | T | C | C | |
| 57 | + ``` |
| 58 | +
|
| 59 | + ``` |
| 60 | + | REFERENCE_ALLELE | TUMOR_SEQ_ALLELE2 | |
| 61 | + | ---------------- | ----------------- | |
| 62 | + | C | A | |
| 63 | + | T | C | |
| 64 | + ``` |
| 65 | +
|
| 66 | +
|
| 67 | + Example: Invalid Examples |
| 68 | + ``` |
| 69 | + | REFERENCE_ALLELE | TUMOR_SEQ_ALLELE1 | TUMOR_SEQ_ALLELE2 | |
| 70 | + | ---------------- | ----------------- | ----------------- | |
| 71 | + | C | C | A | |
| 72 | + | C | A | A | |
| 73 | + ``` |
| 74 | +
|
| 75 | + ``` |
| 76 | + | REFERENCE_ALLELE | TUMOR_SEQ_ALLELE1 | TUMOR_SEQ_ALLELE2 | |
| 77 | + | ---------------- | ----------------- | ----------------- | |
| 78 | + | A | C | A | |
| 79 | + | T | C | T | |
| 80 | + ``` |
| 81 | +
|
| 82 | + ``` |
| 83 | + | REFERENCE_ALLELE | TUMOR_SEQ_ALLELE2 | |
| 84 | + | ---------------- | ----------------- | |
| 85 | + | C | C | |
| 86 | + | T | C | |
| 87 | + ``` |
| 88 | +
|
| 89 | +
|
| 90 | + See this [Genome Nexus issue](https://github.com/genome-nexus/annotation-tools/issues/26) for |
| 91 | + more background regarding why this validation rule was implemented. |
| 92 | +
|
| 93 | + Args: |
| 94 | + df: input mutation dataframe |
| 95 | +
|
| 96 | + Returns: |
| 97 | + str: the error message |
52 | 98 | """ |
53 | 99 | tsa2_col_exist = process_functions.checkColExist(df, "TUMOR_SEQ_ALLELE2") |
54 | 100 | tsa1_col_exist = process_functions.checkColExist(df, "TUMOR_SEQ_ALLELE1") |
@@ -135,6 +181,32 @@ def _validate(self, mutationDF): |
135 | 181 | This function validates the mutation file to make sure it |
136 | 182 | adheres to the mutation SOP. |
137 | 183 |
|
| 184 | + t_depth: This column is conditionally optional. |
| 185 | + 1. If this column is missing, the data must include the t_ref_count column. Otherwise, it will cause a validation error. |
| 186 | + 2. If this column is present, it must have one of the following: |
| 187 | + - A mix of numeric values and NAs |
| 188 | + - All NAs |
| 189 | + - All numeric values |
| 190 | +
|
| 191 | + There are no other checks on the actual values in this column. |
| 192 | +
|
| 193 | + t_ref_count: This column is conditionally optional. |
| 194 | + 1. If this column is missing, the data must include the t_depth column. Otherwise, it will cause a validation error. |
| 195 | + 2. If this column is present, it must have one of the following: |
| 196 | + - A mix of numeric values and NAs |
| 197 | + - All NAs |
| 198 | + - All numeric values |
| 199 | +
|
| 200 | + There are no other checks on the actual values in this column. |
| 201 | +
|
| 202 | + t_alt_count: This column is entirely optional. |
| 203 | + 1. If this column is present, it must have one of the following: |
| 204 | + - A mix of numeric values and NAs |
| 205 | + - All NAs |
| 206 | + - All numeric values |
| 207 | +
|
| 208 | + There are no other checks on the actual values in this column. |
| 209 | +
|
138 | 210 | Args: |
139 | 211 | mutationDF: mutation dataframe |
140 | 212 |
|
|
0 commit comments